encoder/cppspmd_math.h - external/github.com/BinomialLLC/basis_universal - Git at Google

 // Do not include this header directly.
 //
 // Copyright 2020-2021 Binomial LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // The general goal of these vectorized estimated math functions is scalability/performance.
 // There are explictly no checks NaN's/Inf's on the input arguments. There are no assertions either.
 // These are fast estimate functions - if you need more than that, use stdlib. Please do a proper
 // engineering analysis before relying on them.
 // I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
 // I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.

 CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv)
 {
 	vfloat c = frac(abs(a * b_inv)) * abs(b);
 	return spmd_ternaryf(a < 0, -c, c);
 }

 CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv)
 {
 	return frac(a * b_inv) * b;
 }

 // Avoids dividing by zero or very small values.
 CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
 {
 	return a / spmd_ternaryf( abs(b) > fDivThresh, b, spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh) );
 }

 /*
 	clang 9.0.0 for win /fp:precise release
 	f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824

 	log2_est():
 	max abs err: 0.0000023076808731
 	max rel err: 0.0000000756678881
 	avg abs err: 0.0000007535452724
 	avg rel err: 0.0000000235117843

 	XMVectorLog2():
 	max abs err: 0.0000023329709933
 	max rel err: 0.0000000826961046
 	avg abs err: 0.0000007564889684
 	avg rel err: 0.0000000236051899

 	std::log2f():
 	max abs err: 0.0000020265979401
 	max rel err: 0.0000000626647654
 	avg abs err: 0.0000007494445227
 	avg rel err: 0.0000000233800985
 */

 // See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
 inline vfloat spmd_kernel::log2_est(vfloat v)
 {
 	vfloat signif, fexp;

 	// Just clamp to a very small value, instead of checking for invalid inputs.
 	vfloat x = max(v, 2.2e-38f);

 	/*
 	 * Assume IEEE representation, which is sgn(1):exp(8):frac(23)
 	 * representing (1+frac)*2^(exp-127).  Call 1+frac the significand
 	 */

 	 // get exponent
 	vint ux1_i = cast_vfloat_to_vint(x);

 	vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23);

 	// actual exponent is exp-127, will subtract 127 later

 	vint ux2_i;
 	vfloat ux2_f;

 	vint greater = ux1_i & 0x00400000;  // true if signif > 1.5
 	SPMD_SIF(greater != 0)
 	{
 		// signif >= 1.5 so need to divide by 2.  Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1
 		store_all(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f000000);

 		store_all(ux2_f, cast_vint_to_vfloat(ux2_i));

 		// 126 instead of 127 compensates for division by 2
 		store_all(fexp, vfloat(exp - 126));
 	}
 	SPMD_SELSE(greater != 0)
 	{
 		// get signif by stuffing exp = 127 which corresponds to an exponent of 0
 		store(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f800000);

 		store(ux2_f, cast_vint_to_vfloat(ux2_i));

 		store(fexp, vfloat(exp - 127));
 	}
 	SPMD_SENDIF

 	store_all(signif, ux2_f);
 	store_all(signif, signif - 1.0f);

 	const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f;

 	vfloat xm1 = signif;
 	vfloat xm1sqr = xm1 * xm1;

 	return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e));

 	// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
 	//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
 }

 // Uses log2_est(), so this function must be <= the precision of that.
 inline vfloat spmd_kernel::log_est(vfloat v)
 {
 	return log2_est(v) * 0.693147181f;
 }

 CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
 {
 	// Assume we're using equation (2)
 	store_all(adjustment, 0);

 	// integer part of the input argument
 	vint int_arg = (vint)arg;

 	// if frac(arg) is in [0.5, 1.0]...
 	SPMD_SIF((arg - int_arg) > 0.5f)
 	{
 		store(adjustment, 1);

 		// then change it to [0.0, 0.5]
 		store(arg, arg - 0.5f);
 	}
 	SPMD_SENDIF

 	// arg == just the fractional part
 	store_all(arg, arg - (vfloat)int_arg);

 	// Now compute 2** (int) arg.
 	store_all(int_arg, min(int_arg + 127, 254));

 	store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23)));
 }

 /*
 	clang 9.0.0 for win /fp:precise release
 	f range : -50.0000000000000000 49.9999940395355225, vals : 16777216

 	exp2_est():
 	Total passed near - zero check : 16777216
 	Total sign diffs : 0
 	max abs err: 1668910609.7500000000000000
 	max rel err: 0.0000015642030031
 	avg abs err: 10793794.4007573910057545
 	avg rel err: 0.0000003890893282

 	XMVectorExp2():
 	Total passed near-zero check: 16777216
 	Total sign diffs: 0
 	max abs err: 1665552836.8750000000000000
 	max rel err: 0.0000114674862370
 	avg abs err: 10771868.2627860084176064
 	avg rel err: 0.0000011218880770

 	std::exp2f():
 	Total passed near-zero check: 16777216
 	Total sign diffs: 0
 	max abs err: 1591636585.6250000000000000
 	max rel err: 0.0000014849731018
 	avg abs err: 10775800.3204844966530800
 	avg rel err: 0.0000003851496422
 */

 // http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
 inline vfloat spmd_kernel::exp2_est(vfloat arg)
 {
 	SPMD_BEGIN_CALL

 	const vfloat P00 = +7.2152891521493f;
 	const vfloat P01 = +0.0576900723731f;
 	const vfloat Q00 = +20.8189237930062f;
 	const vfloat Q01 = +1.0f;
 	const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling

 	vfloat result = 0.0f;

 	// Return 0 if arg is too large.
 	// We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values.
 	SPMD_IF(abs(arg) > 126.0f)
 	{
 		spmd_return();
 	}
 	SPMD_END_IF

 	// 2**(int(a))
 	vfloat two_int_a;

 	// set to 1 by reduce_expb
 	vint adjustment;

 	// 0 if arg is +; 1 if negative
 	vint negative = 0;

 	// If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-1) = 1/(n**x).
 	SPMD_SIF(arg < 0.0f)
 	{
 		store(arg, -arg);
 		store(negative, 1);
 	}
 	SPMD_SENDIF

 	store_all(arg, min(arg, 126.0f));

 	// reduce to [0.0, 0.5]
 	reduce_expb(arg, two_int_a, adjustment);

 	// The format of the polynomial is:
 	//  answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2))
 	//
 	//  The following computes the polynomial in several steps:

 	// Q(x**2)
 	vfloat Q = vfma(Q01, (arg * arg), Q00);

 	// x*P(x**2)
 	vfloat x_P = arg * (vfma(P01, arg * arg, P00));

 	vfloat answer = (Q + x_P) / (Q - x_P);

 	// Now correct for the scaling factor of 2**(int(a))
 	store_all(answer, answer * two_int_a);

 	// If the result had a fractional part > 0.5, correct for that
 	store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer));

 	// Correct for a negative input
 	SPMD_SIF(negative != 0)
 	{
 		store(answer, 1.0f / answer);
 	}
 	SPMD_SENDIF

 	store(result, answer);

 	return result;
 }

 inline vfloat spmd_kernel::exp_est(vfloat arg)
 {
 	// e^x = exp2(x / log_base_e(2))
 	// constant is 1.0/(log(2)/log(e)) or 1/log(2)
 	return exp2_est(arg * 1.44269504f);
 }

 inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
 {
 	return exp_est(log_est(arg1) * arg2);
 }

 /*
 	clang 9.0.0 for win /fp:precise release
 	Total near-zero: 144, output above near-zero tresh: 30
 	Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
 	Total near-zero sign diffs: 5
 	Total passed near-zero check: 16777072
 	Total sign diffs: 5
 	max abs err: 0.0000031375306036
 	max rel err: 0.1140846017075028
 	avg abs err: 0.0000003026226621
 	avg rel err: 0.0000033564977623
 */

 // Math from this web page: http://developer.download.nvidia.com/cg/sin.html
 // This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
 inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
 {
 	const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
 	const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
 	const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
 	const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
 	const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;

 	vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;

 	store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);

 	store_all(r1_y, frac(r1_x));

 	store_all(r2_x, (vfloat)(r1_y < c1_x));

 	store_all(r2_y, (vfloat)(r1_y >= c1_y));
 	store_all(r2_z, (vfloat)(r1_y >= c1_z));

 	store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));

 	store_all(r0_x, c0_x - r1_y);
 	store_all(r0_y, c0_y - r1_y);
 	store_all(r0_z, c0_z - r1_y);

 	store_all(r0_x, r0_x * r0_x);
 	store_all(r0_y, r0_y * r0_y);
 	store_all(r0_z, r0_z * r0_z);

 	store_all(r1_x, vfma(c2_x, r0_x, c2_z));
 	store_all(r1_y, vfma(c2_y, r0_y, c2_w));
 	store_all(r1_z, vfma(c2_x, r0_z, c2_z));

 	store_all(r1_x, vfma(r1_x, r0_x, c3_x));
 	store_all(r1_y, vfma(r1_y, r0_y, c3_y));
 	store_all(r1_z, vfma(r1_z, r0_z, c3_x));

 	store_all(r1_x, vfma(r1_x, r0_x, c3_z));
 	store_all(r1_y, vfma(r1_y, r0_y, c3_w));
 	store_all(r1_z, vfma(r1_z, r0_z, c3_z));

 	store_all(r1_x, vfma(r1_x, r0_x, c4_x));
 	store_all(r1_y, vfma(r1_y, r0_y, c4_y));
 	store_all(r1_z, vfma(r1_z, r0_z, c4_x));

 	store_all(r1_x, vfma(r1_x, r0_x, c4_z));
 	store_all(r1_y, vfma(r1_y, r0_y, c4_w));
 	store_all(r1_z, vfma(r1_z, r0_z, c4_z));

 	store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));

 	return r0_x;
 }

 // positive values only
 CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
 {
 	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
 	const int mag = 0x7EF311C3;
 	const float fMinThresh = .0000125f;

 	vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));

 	vint x_l = vint(mag) - cast_vfloat_to_vint(l);

 	vfloat rcp_l = cast_vint_to_vfloat(x_l);

 	return rcp_l * vfnma(rcp_l, q, 2.0f);
 }

 CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
 {
 	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
 	const int mag = 0x7EF311C3;
 	const float fMinThresh = .0000125f;

 	vfloat s = sign(t);
 	vfloat q = abs(t);

 	vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));

 	vint x_l = vint(mag) - cast_vfloat_to_vint(l);

 	vfloat rcp_l = cast_vint_to_vfloat(x_l);

 	return rcp_l * vfnma(rcp_l, q, 2.0f) * s;
 }

 // https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
 // https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
 CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
 {
 	vfloat xhalf = 0.5f * x0;
 	vfloat x = cast_vint_to_vfloat(vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
 	return x * vfnma(xhalf * x, x, 1.5008909f);
 }

 CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
 {
 	vfloat xhalf = 0.5f * x0;
 	vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
 	vfloat x1 = x * vfnma(xhalf * x, x, 1.5);
 	vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5);
 	return x2;
 }

 // Math from: http://developer.download.nvidia.com/cg/atan2.html
 // TODO: Needs more validation, parameter checking.
 CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
 {
 	vfloat t1 = abs(y);
 	vfloat t3 = abs(x);

 	vfloat t0 = max(t3, t1);
 	store_all(t1, min(t3, t1));

 	store_all(t3, t1 / t0);

 	vfloat t4 = t3 * t3;
 	store_all(t0, vfma(-0.013480470f, t4, 0.057477314f));
 	store_all(t0, vfms(t0, t4, 0.121239071f));
 	store_all(t0, vfma(t0, t4, 0.195635925f));
 	store_all(t0, vfms(t0, t4, 0.332994597f));
 	store_all(t0, vfma(t0, t4, 0.999995630f));
 	store_all(t3, t0 * t3);

 	store_all(t3, spmd_ternaryf(abs(y) > abs(x), vfloat(1.570796327f) - t3, t3));

 	store_all(t3, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - t3, t3));
 	store_all(t3, spmd_ternaryf(y < 0.0f, -t3, t3));

 	return t3;
 }

 /*
     clang 9.0.0 for win /fp:precise release
 	Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
 	Skipped angles near 90/270 within +- .001 radians.
 	Near-zero threshold: .0000125f
 	Near-zero output above check threshold: 1e-6f

 	Total near-zero: 144, output above near-zero tresh: 20
 	Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
 	Total near-zero sign diffs: 5
 	Total passed near-zero check: 16766400
 	Total sign diffs: 5
 	max abs err: 1.4982600811139264
 	max rel err: 0.1459155900188041
 	avg rel err: 0.0000054659502568

 	XMVectorTan() precise:
 	Total near-zero: 144, output above near-zero tresh: 18
 	Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
 	Total near-zero sign diffs: 0
 	Total passed near-zero check: 16766400
 	Total sign diffs: 0
 	max abs err: 1.9883573246424930
 	max rel err: 0.1459724171926864
 	avg rel err: 0.0000054965766843

 	std::tanf():
 	Total near-zero: 144, output above near-zero tresh: 0
 	Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
 	Total near-zero sign diffs: 11
 	Total passed near-zero check: 16766400
 	Total sign diffs: 11
 	max abs err: 0.8989131818294709
 	max rel err: 0.0573181403173166
 	avg rel err: 0.0000030791301203

 	Originally from:
 	http://www.ganssle.com/approx.htm
 */

 CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
 {
 	// Original double version was 8.2 digits
 	//double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f;
 	// Tuned float constants for lower avg rel error (without using FMA3):
 	const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
 	vfloat x2 = x * x;
 	return (x * (vfma(c2, x2, c1)) / (vfma(x2, (c4 + x2), c3)));
 }

 // Don't call this for angles close to 90/270!.
 inline vfloat spmd_kernel::tan_est(vfloat x)
 {
 	const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
 	CPPSPMD_DECL(const uint8_t, s_table0[16]) =	{ 128 + 0, 128 + 2, 128 + -2, 128 + 4,    128 + 0, 128 + 2, 128 + -2, 128 + 4,	  128 + 0, 128 + 2, 128 + -2, 128 + 4,   128 + 0, 128 + 2, 128 + -2, 128 + 4 };

 	vint table = init_lookup4(s_table0); // a load
 	vint sgn = cast_vfloat_to_vint(x) & 0x80000000;

 	store_all(x, abs(x));
 	vfloat orig_x = x;

 	vfloat q = x * fOneOverPi;
 	store_all(x, q - floor(q));

 	vfloat x4 = x * 4.0f;
 	vint octant = (vint)(x4);

 	vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);

 	vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle

 	vfloat bias = (vfloat)k + -128.0f;
 	vfloat y = x0 + bias;

 	vfloat z = tan82(y);

 	vfloat r;

 	vbool octant_one_or_two = (octant == 1) || (octant == 2);

 	// SPMD optimization - skip costly divide if we can
 	if (spmd_any(octant_one_or_two))
 	{
 		const float fDivThresh = .4371e-7f;
 		vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));

 		vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
 		store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
 	}
 	else
 	{
 		store_all(r, spmd_ternaryf(octant == 0, z, -z));
 	}

 	// Small angle approximation, to decrease the max rel error near Pi.
 	SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
 	{
 		store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
 	}
 	SPMD_SENDIF

 	return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
 }

 inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
 {
 	store(x.a, 0xf1ea5eed);
 	store(x.b, seed ^ 0xd8487b1f);
 	store(x.c, seed ^ 0xdbadef9a);
 	store(x.d, seed);
 	for (int i = 0; i < 20; ++i)
 		(void)get_randu(x);
 }

 // https://burtleburtle.net/bob/rand/smallprng.html
 // Returns 32-bit unsigned random numbers.
 inline vint spmd_kernel::get_randu(rand_context& x)
 {
 	vint e = x.a - VINT_ROT(x.b, 27);
 	store(x.a, x.b ^ VINT_ROT(x.c, 17));
 	store(x.b, x.c + x.d);
 	store(x.c, x.d + e);
 	store(x.d, e + x.a);
 	return x.d;
 }

 // Returns random numbers between [low, high), or low if low >= high
 inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
 {
 	vint rnd = get_randu(x);

 	vint range = high - low;

 	vint rnd_range = mulhiu(rnd, range);

 	return spmd_ternaryi(low < high, low + rnd_range, low);
 }

 // Returns random numbers between [low, high), or low if low >= high
 inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
 {
 	vint rndi = get_randu(x) & 0x7fffff;

 	vfloat rnd = (vfloat)(rndi) * (1.0f / 8388608.0f);

 	return spmd_ternaryf(low < high, vfma(high - low, rnd, low), low);
 }

 CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
 {
 	const uint8_t tab1_bytes[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
 	const uint8_t tab2_bytes[16] = { 0, 8 << 4, 4 << 4, 12 << 4, 2 << 4, 10 << 4, 6 << 4, 14 << 4, 1 << 4, 9 << 4, 5 << 4, 13 << 4, 3 << 4, 11 << 4, 7 << 4, 15 << 4 };
 	store_all(tab1, init_lookup4(tab1_bytes));
 	store_all(tab2, init_lookup4(tab2_bytes));
 }

 CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
 {
 	vint r0 = table_lookup4_8(k & 0x7F7F7F7F, tab2);
 	vint r1 = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
 	vint r3 = r0 | r1;
 	return byteswap(r3);
 }

 CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
 {
 	CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };

 	vint tab = init_lookup4(s_tab);

 	//x <= 0x0000ffff
 	vbool c0 = (x & 0xFFFF0000) == 0;
 	vint n0 = spmd_ternaryi(c0, 16, 0);
 	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);

 	//x <= 0x00ffffff
 	vbool c1 = (x0 & 0xFF000000) == 0;
 	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
 	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);

 	//x <= 0x0fffffff
 	vbool c2 = (x1 & 0xF0000000) == 0;
 	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
 	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);

 	return table_lookup4_8(VUINT_SHIFT_RIGHT(x2, 28), tab) + n2;
 }

 CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
 {
 	//x <= 0x0000ffff
 	vbool c0 = (x & 0xFFFF0000) == 0;
 	vint n0 = spmd_ternaryi(c0, 16, 0);
 	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);

 	//x <= 0x00ffffff
 	vbool c1 = (x0 & 0xFF000000) == 0;
 	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
 	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);

 	//x <= 0x0fffffff
 	vbool c2 = (x1 & 0xF0000000) == 0;
 	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
 	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);

 	// x <= 0x3fffffff
 	vbool c3 = (x2 & 0xC0000000) == 0;
 	vint n3 = spmd_ternaryi(c3, n2 + 2, n2);
 	vint x3 = spmd_ternaryi(c3, VINT_SHIFT_LEFT(x2, 2), x2);

 	// x <= 0x7fffffff
 	vbool c4 = (x3 & 0x80000000) == 0;
 	return spmd_ternaryi(c4, n3 + 1, n3);
 }

 CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
 {
 	// cast the least significant bit in v to a float
 	vfloat f = (vfloat)(x & -x);

 	// extract exponent and adjust
 	return VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) - 0x7F;
 }

 CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
 {
 	vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);
 	vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 0x33333333);
 	return VUINT_SHIFT_RIGHT(((v1 + VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F) * 0x1010101), 24);
 }

 CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b)
 {
 	return cmpeq_epi16(subs_epu16(a, b), vint(0));
 }

 CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b)
 {
 	return cmple_epu16(b, a);
 }

 CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
 {
 	return andnot(cmpeq_epi16(a, b), cmple_epu16(b, a));
 }

 CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
 {
 	return cmpgt_epu16(b, a);
 }

 CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
 {
 	return cmpeq_epi16(a, b) | cmpgt_epi16(a, b);
 }

 CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
 {
 	return cmpge_epi16(b, a);
 }

 void spmd_kernel::print_vint(vint v)
 {
 	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
 		printf("%i ", extract(v, i));
 	printf("\n");
 }

 void spmd_kernel::print_vbool(vbool v)
 {
 	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
 		printf("%i ", extract(v, i) ? 1 : 0);
 	printf("\n");
 }

 void spmd_kernel::print_vint_hex(vint v)
 {
 	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
 		printf("0x%X ", extract(v, i));
 	printf("\n");
 }

 void spmd_kernel::print_active_lanes(const char *pPrefix)
 {
 	CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
 	memset(flags, 0, sizeof(flags));
 	storeu_linear(flags, vint(1));

 	if (pPrefix)
 		printf("%s", pPrefix);

 	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
 	{
 		if (flags[i])
 			printf("%u ", i);
 	}
 	printf("\n");
 }

 void spmd_kernel::print_vfloat(vfloat v)
 {
 	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
 		printf("%f ", extract(v, i));
 	printf("\n");
 }
	// Do not include this header directly.
	//
	// Copyright 2020-2021 Binomial LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// The general goal of these vectorized estimated math functions is scalability/performance.
	// There are explictly no checks NaN's/Inf's on the input arguments. There are no assertions either.
	// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper
	// engineering analysis before relying on them.
	// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
	// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.

	CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv)
	{
	vfloat c = frac(abs(a * b_inv)) * abs(b);
	return spmd_ternaryf(a < 0, -c, c);
	}

	CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv)
	{
	return frac(a * b_inv) * b;
	}

	// Avoids dividing by zero or very small values.
	CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
	{
	return a / spmd_ternaryf( abs(b) > fDivThresh, b, spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh) );
	}

	/*
	clang 9.0.0 for win /fp:precise release
	f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824

	log2_est():
	max abs err: 0.0000023076808731
	max rel err: 0.0000000756678881
	avg abs err: 0.0000007535452724
	avg rel err: 0.0000000235117843

	XMVectorLog2():
	max abs err: 0.0000023329709933
	max rel err: 0.0000000826961046
	avg abs err: 0.0000007564889684
	avg rel err: 0.0000000236051899

	std::log2f():
	max abs err: 0.0000020265979401
	max rel err: 0.0000000626647654
	avg abs err: 0.0000007494445227
	avg rel err: 0.0000000233800985
	*/

	// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
	inline vfloat spmd_kernel::log2_est(vfloat v)
	{
	vfloat signif, fexp;

	// Just clamp to a very small value, instead of checking for invalid inputs.
	vfloat x = max(v, 2.2e-38f);

	/*
	* Assume IEEE representation, which is sgn(1):exp(8):frac(23)
	* representing (1+frac)*2^(exp-127). Call 1+frac the significand
	*/

	// get exponent
	vint ux1_i = cast_vfloat_to_vint(x);

	vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23);

	// actual exponent is exp-127, will subtract 127 later

	vint ux2_i;
	vfloat ux2_f;

	vint greater = ux1_i & 0x00400000; // true if signif > 1.5
	SPMD_SIF(greater != 0)
	{
	// signif >= 1.5 so need to divide by 2. Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1
	store_all(ux2_i, (ux1_i & 0x007FFFFF) \| 0x3f000000);

	store_all(ux2_f, cast_vint_to_vfloat(ux2_i));

	// 126 instead of 127 compensates for division by 2
	store_all(fexp, vfloat(exp - 126));
	}
	SPMD_SELSE(greater != 0)
	{
	// get signif by stuffing exp = 127 which corresponds to an exponent of 0
	store(ux2_i, (ux1_i & 0x007FFFFF) \| 0x3f800000);

	store(ux2_f, cast_vint_to_vfloat(ux2_i));

	store(fexp, vfloat(exp - 127));
	}
	SPMD_SENDIF

	store_all(signif, ux2_f);
	store_all(signif, signif - 1.0f);

	const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f;

	vfloat xm1 = signif;
	vfloat xm1sqr = xm1 * xm1;

	return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e));

	// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
	//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
	}

	// Uses log2_est(), so this function must be <= the precision of that.
	inline vfloat spmd_kernel::log_est(vfloat v)
	{
	return log2_est(v) * 0.693147181f;
	}

	CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
	{
	// Assume we're using equation (2)
	store_all(adjustment, 0);

	// integer part of the input argument
	vint int_arg = (vint)arg;

	// if frac(arg) is in [0.5, 1.0]...
	SPMD_SIF((arg - int_arg) > 0.5f)
	{
	store(adjustment, 1);

	// then change it to [0.0, 0.5]
	store(arg, arg - 0.5f);
	}
	SPMD_SENDIF

	// arg == just the fractional part
	store_all(arg, arg - (vfloat)int_arg);

	// Now compute 2** (int) arg.
	store_all(int_arg, min(int_arg + 127, 254));

	store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23)));
	}

	/*
	clang 9.0.0 for win /fp:precise release
	f range : -50.0000000000000000 49.9999940395355225, vals : 16777216

	exp2_est():
	Total passed near - zero check : 16777216
	Total sign diffs : 0
	max abs err: 1668910609.7500000000000000
	max rel err: 0.0000015642030031
	avg abs err: 10793794.4007573910057545
	avg rel err: 0.0000003890893282

	XMVectorExp2():
	Total passed near-zero check: 16777216
	Total sign diffs: 0
	max abs err: 1665552836.8750000000000000
	max rel err: 0.0000114674862370
	avg abs err: 10771868.2627860084176064
	avg rel err: 0.0000011218880770

	std::exp2f():
	Total passed near-zero check: 16777216
	Total sign diffs: 0
	max abs err: 1591636585.6250000000000000
	max rel err: 0.0000014849731018
	avg abs err: 10775800.3204844966530800
	avg rel err: 0.0000003851496422
	*/

	// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
	inline vfloat spmd_kernel::exp2_est(vfloat arg)
	{
	SPMD_BEGIN_CALL

	const vfloat P00 = +7.2152891521493f;
	const vfloat P01 = +0.0576900723731f;
	const vfloat Q00 = +20.8189237930062f;
	const vfloat Q01 = +1.0f;
	const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling

	vfloat result = 0.0f;

	// Return 0 if arg is too large.
	// We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values.
	SPMD_IF(abs(arg) > 126.0f)
	{
	spmd_return();
	}
	SPMD_END_IF

	// 2**(int(a))
	vfloat two_int_a;

	// set to 1 by reduce_expb
	vint adjustment;

	// 0 if arg is +; 1 if negative
	vint negative = 0;

	// If the input is negative, invert it. At the end we'll take the reciprocal, since n(-1) = 1/(nx).
	SPMD_SIF(arg < 0.0f)
	{
	store(arg, -arg);
	store(negative, 1);
	}
	SPMD_SENDIF

	store_all(arg, min(arg, 126.0f));

	// reduce to [0.0, 0.5]
	reduce_expb(arg, two_int_a, adjustment);

	// The format of the polynomial is:
	// answer=(Q(x*2) + xP(x2))/(Q(x2) - xP(x*2))
	//
	// The following computes the polynomial in several steps:

	// Q(x**2)
	vfloat Q = vfma(Q01, (arg * arg), Q00);

	// xP(x*2)
	vfloat x_P = arg * (vfma(P01, arg * arg, P00));

	vfloat answer = (Q + x_P) / (Q - x_P);

	// Now correct for the scaling factor of 2**(int(a))
	store_all(answer, answer * two_int_a);

	// If the result had a fractional part > 0.5, correct for that
	store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer));

	// Correct for a negative input
	SPMD_SIF(negative != 0)
	{
	store(answer, 1.0f / answer);
	}
	SPMD_SENDIF

	store(result, answer);

	return result;
	}

	inline vfloat spmd_kernel::exp_est(vfloat arg)
	{
	// e^x = exp2(x / log_base_e(2))
	// constant is 1.0/(log(2)/log(e)) or 1/log(2)
	return exp2_est(arg * 1.44269504f);
	}

	inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
	{
	return exp_est(log_est(arg1) * arg2);
	}

	/*
	clang 9.0.0 for win /fp:precise release
	Total near-zero: 144, output above near-zero tresh: 30
	Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
	Total near-zero sign diffs: 5
	Total passed near-zero check: 16777072
	Total sign diffs: 5
	max abs err: 0.0000031375306036
	max rel err: 0.1140846017075028
	avg abs err: 0.0000003026226621
	avg rel err: 0.0000033564977623
	*/

	// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
	// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
	inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
	{
	const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
	const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
	const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
	const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
	const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;

	vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;

	store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);

	store_all(r1_y, frac(r1_x));

	store_all(r2_x, (vfloat)(r1_y < c1_x));

	store_all(r2_y, (vfloat)(r1_y >= c1_y));
	store_all(r2_z, (vfloat)(r1_y >= c1_z));

	store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));

	store_all(r0_x, c0_x - r1_y);
	store_all(r0_y, c0_y - r1_y);
	store_all(r0_z, c0_z - r1_y);

	store_all(r0_x, r0_x * r0_x);
	store_all(r0_y, r0_y * r0_y);
	store_all(r0_z, r0_z * r0_z);

	store_all(r1_x, vfma(c2_x, r0_x, c2_z));
	store_all(r1_y, vfma(c2_y, r0_y, c2_w));
	store_all(r1_z, vfma(c2_x, r0_z, c2_z));

	store_all(r1_x, vfma(r1_x, r0_x, c3_x));
	store_all(r1_y, vfma(r1_y, r0_y, c3_y));
	store_all(r1_z, vfma(r1_z, r0_z, c3_x));

	store_all(r1_x, vfma(r1_x, r0_x, c3_z));
	store_all(r1_y, vfma(r1_y, r0_y, c3_w));
	store_all(r1_z, vfma(r1_z, r0_z, c3_z));

	store_all(r1_x, vfma(r1_x, r0_x, c4_x));
	store_all(r1_y, vfma(r1_y, r0_y, c4_y));
	store_all(r1_z, vfma(r1_z, r0_z, c4_x));

	store_all(r1_x, vfma(r1_x, r0_x, c4_z));
	store_all(r1_y, vfma(r1_y, r0_y, c4_w));
	store_all(r1_z, vfma(r1_z, r0_z, c4_z));

	store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));

	return r0_x;
	}

	// positive values only
	CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
	{
	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3
	const int mag = 0x7EF311C3;
	const float fMinThresh = .0000125f;

	vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));

	vint x_l = vint(mag) - cast_vfloat_to_vint(l);

	vfloat rcp_l = cast_vint_to_vfloat(x_l);

	return rcp_l * vfnma(rcp_l, q, 2.0f);
	}

	CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
	{
	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3
	const int mag = 0x7EF311C3;
	const float fMinThresh = .0000125f;

	vfloat s = sign(t);
	vfloat q = abs(t);

	vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));

	vint x_l = vint(mag) - cast_vfloat_to_vint(l);

	vfloat rcp_l = cast_vint_to_vfloat(x_l);

	return rcp_l * vfnma(rcp_l, q, 2.0f) * s;
	}

	// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
	// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
	CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
	{
	vfloat xhalf = 0.5f * x0;
	vfloat x = cast_vint_to_vfloat(vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
	return x * vfnma(xhalf * x, x, 1.5008909f);
	}

	CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
	{
	vfloat xhalf = 0.5f * x0;
	vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
	vfloat x1 = x * vfnma(xhalf * x, x, 1.5);
	vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5);
	return x2;
	}

	// Math from: http://developer.download.nvidia.com/cg/atan2.html
	// TODO: Needs more validation, parameter checking.
	CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
	{
	vfloat t1 = abs(y);
	vfloat t3 = abs(x);

	vfloat t0 = max(t3, t1);
	store_all(t1, min(t3, t1));

	store_all(t3, t1 / t0);

	vfloat t4 = t3 * t3;
	store_all(t0, vfma(-0.013480470f, t4, 0.057477314f));
	store_all(t0, vfms(t0, t4, 0.121239071f));
	store_all(t0, vfma(t0, t4, 0.195635925f));
	store_all(t0, vfms(t0, t4, 0.332994597f));
	store_all(t0, vfma(t0, t4, 0.999995630f));
	store_all(t3, t0 * t3);

	store_all(t3, spmd_ternaryf(abs(y) > abs(x), vfloat(1.570796327f) - t3, t3));

	store_all(t3, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - t3, t3));
	store_all(t3, spmd_ternaryf(y < 0.0f, -t3, t3));

	return t3;
	}

	/*
	clang 9.0.0 for win /fp:precise release
	Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
	Skipped angles near 90/270 within +- .001 radians.
	Near-zero threshold: .0000125f
	Near-zero output above check threshold: 1e-6f

	Total near-zero: 144, output above near-zero tresh: 20
	Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
	Total near-zero sign diffs: 5
	Total passed near-zero check: 16766400
	Total sign diffs: 5
	max abs err: 1.4982600811139264
	max rel err: 0.1459155900188041
	avg rel err: 0.0000054659502568

	XMVectorTan() precise:
	Total near-zero: 144, output above near-zero tresh: 18
	Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
	Total near-zero sign diffs: 0
	Total passed near-zero check: 16766400
	Total sign diffs: 0
	max abs err: 1.9883573246424930
	max rel err: 0.1459724171926864
	avg rel err: 0.0000054965766843

	std::tanf():
	Total near-zero: 144, output above near-zero tresh: 0
	Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
	Total near-zero sign diffs: 11
	Total passed near-zero check: 16766400
	Total sign diffs: 11
	max abs err: 0.8989131818294709
	max rel err: 0.0573181403173166
	avg rel err: 0.0000030791301203

	Originally from:
	http://www.ganssle.com/approx.htm
	*/

	CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
	{
	// Original double version was 8.2 digits
	//double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f;
	// Tuned float constants for lower avg rel error (without using FMA3):
	const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
	vfloat x2 = x * x;
	return (x * (vfma(c2, x2, c1)) / (vfma(x2, (c4 + x2), c3)));
	}

	// Don't call this for angles close to 90/270!.
	inline vfloat spmd_kernel::tan_est(vfloat x)
	{
	const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
	CPPSPMD_DECL(const uint8_t, s_table0[16]) = { 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4, 128 + 0, 128 + 2, 128 + -2, 128 + 4 };

	vint table = init_lookup4(s_table0); // a load
	vint sgn = cast_vfloat_to_vint(x) & 0x80000000;

	store_all(x, abs(x));
	vfloat orig_x = x;

	vfloat q = x * fOneOverPi;
	store_all(x, q - floor(q));

	vfloat x4 = x * 4.0f;
	vint octant = (vint)(x4);

	vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);

	vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle

	vfloat bias = (vfloat)k + -128.0f;
	vfloat y = x0 + bias;

	vfloat z = tan82(y);

	vfloat r;

	vbool octant_one_or_two = (octant == 1) \|\| (octant == 2);

	// SPMD optimization - skip costly divide if we can
	if (spmd_any(octant_one_or_two))
	{
	const float fDivThresh = .4371e-7f;
	vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));

	vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
	store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
	}
	else
	{
	store_all(r, spmd_ternaryf(octant == 0, z, -z));
	}

	// Small angle approximation, to decrease the max rel error near Pi.
	SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
	{
	store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
	}
	SPMD_SENDIF

	return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
	}

	inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
	{
	store(x.a, 0xf1ea5eed);
	store(x.b, seed ^ 0xd8487b1f);
	store(x.c, seed ^ 0xdbadef9a);
	store(x.d, seed);
	for (int i = 0; i < 20; ++i)
	(void)get_randu(x);
	}

	// https://burtleburtle.net/bob/rand/smallprng.html
	// Returns 32-bit unsigned random numbers.
	inline vint spmd_kernel::get_randu(rand_context& x)
	{
	vint e = x.a - VINT_ROT(x.b, 27);
	store(x.a, x.b ^ VINT_ROT(x.c, 17));
	store(x.b, x.c + x.d);
	store(x.c, x.d + e);
	store(x.d, e + x.a);
	return x.d;
	}

	// Returns random numbers between [low, high), or low if low >= high
	inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
	{
	vint rnd = get_randu(x);

	vint range = high - low;

	vint rnd_range = mulhiu(rnd, range);

	return spmd_ternaryi(low < high, low + rnd_range, low);
	}

	// Returns random numbers between [low, high), or low if low >= high
	inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
	{
	vint rndi = get_randu(x) & 0x7fffff;

	vfloat rnd = (vfloat)(rndi) * (1.0f / 8388608.0f);

	return spmd_ternaryf(low < high, vfma(high - low, rnd, low), low);
	}

	CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
	{
	const uint8_t tab1_bytes[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
	const uint8_t tab2_bytes[16] = { 0, 8 << 4, 4 << 4, 12 << 4, 2 << 4, 10 << 4, 6 << 4, 14 << 4, 1 << 4, 9 << 4, 5 << 4, 13 << 4, 3 << 4, 11 << 4, 7 << 4, 15 << 4 };
	store_all(tab1, init_lookup4(tab1_bytes));
	store_all(tab2, init_lookup4(tab2_bytes));
	}

	CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
	{
	vint r0 = table_lookup4_8(k & 0x7F7F7F7F, tab2);
	vint r1 = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
	vint r3 = r0 \| r1;
	return byteswap(r3);
	}

	CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
	{
	CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };

	vint tab = init_lookup4(s_tab);

	//x <= 0x0000ffff
	vbool c0 = (x & 0xFFFF0000) == 0;
	vint n0 = spmd_ternaryi(c0, 16, 0);
	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);

	//x <= 0x00ffffff
	vbool c1 = (x0 & 0xFF000000) == 0;
	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);

	//x <= 0x0fffffff
	vbool c2 = (x1 & 0xF0000000) == 0;
	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);

	return table_lookup4_8(VUINT_SHIFT_RIGHT(x2, 28), tab) + n2;
	}

	CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
	{
	//x <= 0x0000ffff
	vbool c0 = (x & 0xFFFF0000) == 0;
	vint n0 = spmd_ternaryi(c0, 16, 0);
	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);

	//x <= 0x00ffffff
	vbool c1 = (x0 & 0xFF000000) == 0;
	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);

	//x <= 0x0fffffff
	vbool c2 = (x1 & 0xF0000000) == 0;
	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);

	// x <= 0x3fffffff
	vbool c3 = (x2 & 0xC0000000) == 0;
	vint n3 = spmd_ternaryi(c3, n2 + 2, n2);
	vint x3 = spmd_ternaryi(c3, VINT_SHIFT_LEFT(x2, 2), x2);

	// x <= 0x7fffffff
	vbool c4 = (x3 & 0x80000000) == 0;
	return spmd_ternaryi(c4, n3 + 1, n3);
	}

	CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
	{
	// cast the least significant bit in v to a float
	vfloat f = (vfloat)(x & -x);

	// extract exponent and adjust
	return VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) - 0x7F;
	}

	CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
	{
	vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);
	vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 0x33333333);
	return VUINT_SHIFT_RIGHT(((v1 + VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F) * 0x1010101), 24);
	}

	CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b)
	{
	return cmpeq_epi16(subs_epu16(a, b), vint(0));
	}

	CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b)
	{
	return cmple_epu16(b, a);
	}

	CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
	{
	return andnot(cmpeq_epi16(a, b), cmple_epu16(b, a));
	}

	CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
	{
	return cmpgt_epu16(b, a);
	}

	CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
	{
	return cmpeq_epi16(a, b) \| cmpgt_epi16(a, b);
	}

	CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
	{
	return cmpge_epi16(b, a);
	}

	void spmd_kernel::print_vint(vint v)
	{
	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
	printf("%i ", extract(v, i));
	printf("\n");
	}

	void spmd_kernel::print_vbool(vbool v)
	{
	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
	printf("%i ", extract(v, i) ? 1 : 0);
	printf("\n");
	}

	void spmd_kernel::print_vint_hex(vint v)
	{
	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
	printf("0x%X ", extract(v, i));
	printf("\n");
	}

	void spmd_kernel::print_active_lanes(const char *pPrefix)
	{
	CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
	memset(flags, 0, sizeof(flags));
	storeu_linear(flags, vint(1));

	if (pPrefix)
	printf("%s", pPrefix);

	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
	{
	if (flags[i])
	printf("%u ", i);
	}
	printf("\n");
	}

	void spmd_kernel::print_vfloat(vfloat v)
	{
	for (uint32_t i = 0; i < PROGRAM_COUNT; i++)
	printf("%f ", extract(v, i));
	printf("\n");
	}