| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "SkJumper.h" |
| #include "SkJumper_misc.h" |
| #include <immintrin.h> |
| |
| #if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__) |
| #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang." |
| #endif |
| |
| #define WRAP(name) sk_##name##_ssse3_lowp |
| |
| using K = const SkJumper_constants; |
| static const size_t kStride = 8; |
| |
| template <typename T> using V = T __attribute__((ext_vector_type(8))); |
| using U8 = V<uint8_t>; |
| using U16 = V<uint16_t>; |
| using U32 = V<uint32_t>; |
| |
| // See SkFixed15.h for details on this format and its operations. |
| struct F { |
| U16 vec; |
| |
| F() = default; |
| F(float f) { |
| // After adding 256.0f, the SkFixed15 value is the bottom two bytes of the float. |
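| // A worked example (little-endian, which this x86-64-only file already assumes): |
| // 0.5f + 256.0f == 256.5f, whose bit pattern is 0x43804000; its bottom two bytes |
| // are 0x4000 == 16384 == 0.5 * 32768, exactly the SkFixed15 encoding of 0.5. |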
| f += 256.0f; |
| vec = unaligned_load<uint16_t>(&f); |
| } |
| |
| F(U16 v) : vec(v) {} |
| operator U16() const { return vec; } |
| }; |
| |
| SI F operator+(F x, F y) { return x.vec + y.vec; } |
| SI F operator-(F x, F y) { return x.vec - y.vec; } |
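| // _mm_mulhrs_epi16() computes (((x*y) >> 14) + 1) >> 1 on signed 16-bit lanes, i.e. x*y/32768 |
| // with rounding. SkFixed15 1.0 is 0x8000, which is negative as a signed value, so products |
| // involving 1.0 come out negated; _mm_abs_epi16() flips those back to the value we want. |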
| SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); } |
| SI F mad(F f, F m, F a) { return f*m+a; } |
| SI F inv(F v) { return 1.0f - v; } |
| SI F two(F v) { return v + v; } |
| SI F lerp(F from, F to, F t) { return to*t + from*inv(t); } |
| |
| SI F operator<<(F x, int bits) { return x.vec << bits; } |
| SI F operator>>(F x, int bits) { return x.vec >> bits; } |
| |
| using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F); |
| |
| MAYBE_MSABI |
| extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) { |
| F v{}; |
| auto start = (Stage*)load_and_inc(program); |
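| // Run the pipeline kStride (8) pixels at a time; any leftover 1-7 pixels get one more |
| // pass below with 'tail' set so stages load and store only the valid lanes. |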
| while (x + kStride <= limit) { |
| start(k,program,x,y,0, v,v,v,v, v,v,v,v); |
| x += kStride; |
| } |
| if (size_t tail = limit - x) { |
| start(k,program,x,y,tail, v,v,v,v, v,v,v,v); |
| } |
| return limit; |
| } |
| extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {} |
| |
| #define STAGE(name) \ |
| SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \ |
| F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
| extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \ |
| F r, F g, F b, F a, F dr, F dg, F db, F da) { \ |
| LazyCtx ctx(program); \ |
| name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da); \ |
| auto next = (Stage*)load_and_inc(program); \ |
| next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da); \ |
| } \ |
| SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \ |
| F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
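| |
| // For example, STAGE(premul) below expands into a local premul_k() holding the stage body, plus |
| // an exported sk_premul_ssse3_lowp() that runs it and then calls the next stage from 'program'. |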
| |
| |
| // Helper functions used by multiple stages. |
| |
| template <typename V, typename T> |
| SI V load(const T* src, size_t tail) { |
| __builtin_assume(tail < kStride); |
| if (__builtin_expect(tail, 0)) { |
| V v{}; // Any inactive lanes are zeroed. |
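| // tail is 1..7 here; the cases fall through on purpose, filling lanes 0..tail-1. |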
| switch (tail-1) { |
| case 6: v[6] = src[6]; |
| case 5: v[5] = src[5]; |
| case 4: v[4] = src[4]; |
| case 3: v[3] = src[3]; |
| case 2: v[2] = src[2]; |
| case 1: v[1] = src[1]; |
| case 0: v[0] = src[0]; |
| } |
| return v; |
| } |
| return unaligned_load<V>(src); |
| } |
| |
| template <typename V, typename T> |
| SI void store(T* dst, V v, size_t tail) { |
| __builtin_assume(tail < kStride); |
| if (__builtin_expect(tail, 0)) { |
| switch (tail-1) { |
| case 6: dst[6] = v[6]; |
| case 5: dst[5] = v[5]; |
| case 4: dst[4] = v[4]; |
| case 3: dst[3] = v[3]; |
| case 2: dst[2] = v[2]; |
| case 1: dst[1] = v[1]; |
| case 0: dst[0] = v[0]; |
| } |
| return; |
| } |
| unaligned_store(dst, v); |
| } |
| |
| SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) { |
| // Split the 8 pixels into low and high halves, and reinterpret as vectors of 16-bit values. |
| U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0), |
| hi = unaligned_load<U16>((const uint32_t*)&rgba + 4); |
| |
| // Shuffle so that the 4 bytes of each color channel are contiguous... |
| lo = _mm_shuffle_epi8(lo, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15)); |
| hi = _mm_shuffle_epi8(hi, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15)); |
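| // lo now holds r0 r1 r2 r3 g0 g1 g2 g3 b0 b1 b2 b3 a0 a1 a2 a3 (hi likewise for pixels 4-7). |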
| |
| // ...then get all 8 bytes of each color channel together into a single register. |
| U16 rg = _mm_unpacklo_epi32(lo,hi), |
| ba = _mm_unpackhi_epi32(lo,hi); |
| |
| // Unpack the bytes into the high byte of each 16-bit lane (low byte zero), which gives us a free *256. |
| U16 R = _mm_unpacklo_epi8(U16(0), rg), |
| G = _mm_unpackhi_epi8(U16(0), rg), |
| B = _mm_unpacklo_epi8(U16(0), ba), |
| A = _mm_unpackhi_epi8(U16(0), ba); |
| |
| // Now we scale from [0,255] to [0,32768]. Ideally that's 32768/255 = 128.50196, |
| // but we can approximate that very cheaply as 256*32897/65536 = 128.50391. |
| // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1. |
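| // e.g. 255: (255*256)*32897 >> 16 == 32768 exactly, so fully opaque bytes map to SkFixed15 1.0. |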
| *r = _mm_mulhi_epu16(R, U16(32897)); |
| *g = _mm_mulhi_epu16(G, U16(32897)); |
| *b = _mm_mulhi_epu16(B, U16(32897)); |
| *a = _mm_mulhi_epu16(A, U16(32897)); |
| } |
| SI F from_byte(U8 bytes) { |
| // See from_8888() just above. |
| U16 hi = _mm_unpacklo_epi8(U16(0), widen_cast<__m128i>(bytes)); |
| return (F)_mm_mulhi_epu16(hi, U16(32897)); |
| } |
| |
| SI U32 to_8888(F r, F g, F b, F a) { |
| // We want to interleave and pack these values down from [0,32768] to [0,255]. |
| // Luckily the simplest possible thing works great: >>7, then saturate. |
| // The 'u' in packus handles the saturation to [0,255] we need. |
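| // e.g. 1.0 == 32768 becomes 32768>>7 == 256, which packus clamps down to 255; 0 stays 0. |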
| U16 rb = _mm_packus_epi16(r>>7,b>>7), // r0 r1 r2 r3 r4 r5 r6 r7 b0 b1 b2 b3 b4 b5 b6 b7 |
| ga = _mm_packus_epi16(g>>7,a>>7); |
| |
| U16 rg = _mm_unpacklo_epi8(rb, ga), // r0 g0 r1 g1 ... r7 g7 |
| ba = _mm_unpackhi_epi8(rb, ga); // b0 a0 ... b7 a7 |
| |
| U16 lo = _mm_unpacklo_epi16(rg, ba), // r0 g0 b0 a0 ... r3 g3 b3 a3 |
| hi = _mm_unpackhi_epi16(rg, ba); // r4 g4 b4 a4 ... r7 g7 b7 a7 |
| |
| U32 px; |
| memcpy((uint32_t*)&px + 0, &lo, sizeof(lo)); |
| memcpy((uint32_t*)&px + 4, &hi, sizeof(hi)); |
| return px; |
| } |
| SI U8 to_byte(F v) { |
| // See to_8888() just above. |
| U16 packed = _mm_packus_epi16(v>>7, v>>7); // Doesn't really matter what we pack on top. |
| return unaligned_load<U8>(&packed); |
| } |
| |
| // Stages! |
| |
| STAGE(constant_color) { |
| // We're converting to fixed point, which lets us play some IEEE representation tricks, |
| // replacing a naive *32768 and float->int conversion with a simple float add. |
| using F32x4 = float __attribute__((ext_vector_type(4))); |
| using U16x8 = uint16_t __attribute__((ext_vector_type(8))); |
| auto bits = (U16x8)(unaligned_load<F32x4>((const float*)ctx) + 256.0f); |
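| // On little-endian x86-64 the low 16 bits of each float sit in the even 16-bit lanes, |
| // so lanes 0,2,4,6 hold the SkFixed15 values for r,g,b,a. |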
| r = (U16)bits[0]; |
| g = (U16)bits[2]; |
| b = (U16)bits[4]; |
| a = (U16)bits[6]; |
| } |
| |
| STAGE(set_rgb) { |
| auto rgb = (const float*)ctx; |
| r = rgb[0]; |
| g = rgb[1]; |
| b = rgb[2]; |
| } |
| |
| STAGE(premul) { |
| r = r * a; |
| g = g * a; |
| b = b * a; |
| } |
| |
| STAGE(load_8888) { |
| auto ptr = *(const uint32_t**)ctx + x; |
| from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); |
| } |
| STAGE(store_8888) { |
| auto ptr = *(uint32_t**)ctx + x; |
| store(ptr, to_8888(r,g,b,a), tail); |
| } |
| |
| STAGE(load_a8) { |
| auto ptr = *(const uint8_t**)ctx + x; |
| r = g = b = 0.0f; |
| a = from_byte(load<U8>(ptr, tail)); |
| } |
| STAGE(store_a8) { |
| auto ptr = *(uint8_t**)ctx + x; |
| store(ptr, to_byte(a), tail); |
| } |
| |
| STAGE(load_g8) { |
| auto ptr = *(const uint8_t**)ctx + x; |
| r = g = b = from_byte(load<U8>(ptr, tail)); |
| a = 1.0f; |
| } |
| |
| STAGE(srcover_rgba_8888) { |
| auto ptr = *(uint32_t**)ctx + x; |
| |
| from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
| |
| r = mad(dr, inv(a), r); |
| g = mad(dg, inv(a), g); |
| b = mad(db, inv(a), b); |
| a = mad(da, inv(a), a); |
| |
| store(ptr, to_8888(r,g,b,a), tail); |
| } |
| |
| STAGE(scale_1_float) { |
| float c = *(const float*)ctx; |
| |
| r = r * c; |
| g = g * c; |
| b = b * c; |
| a = a * c; |
| } |
| STAGE(scale_u8) { |
| auto ptr = *(const uint8_t**)ctx + x; |
| |
| U8 scales = load<U8>(ptr, tail); |
| F c = from_byte(scales); |
| |
| r = r * c; |
| g = g * c; |
| b = b * c; |
| a = a * c; |
| } |
| |
| STAGE(lerp_1_float) { |
| float c = *(const float*)ctx; |
| |
| r = lerp(dr, r, c); |
| g = lerp(dg, g, c); |
| b = lerp(db, b, c); |
| a = lerp(da, a, c); |
| } |
| STAGE(lerp_u8) { |
| auto ptr = *(const uint8_t**)ctx + x; |
| |
| U8 scales = load<U8>(ptr, tail); |
| F c = from_byte(scales); |
| |
| r = lerp(dr, r, c); |
| g = lerp(dg, g, c); |
| b = lerp(db, b, c); |
| a = lerp(da, a, c); |
| } |
| |
| STAGE(swap_rb) { |
| auto tmp = r; |
| r = b; |
| b = tmp; |
| } |
| |
| STAGE(swap) { |
| auto swap = [](F& v, F& dv) { |
| auto tmp = v; |
| v = dv; |
| dv = tmp; |
| }; |
| swap(r, dr); |
| swap(g, dg); |
| swap(b, db); |
| swap(a, da); |
| } |
| STAGE(move_src_dst) { |
| dr = r; |
| dg = g; |
| db = b; |
| da = a; |
| } |
| STAGE(move_dst_src) { |
| r = dr; |
| g = dg; |
| b = db; |
| a = da; |
| } |
| |
| // Most blend modes apply the same logic to each channel. |
| #define BLEND_MODE(name) \ |
| SI F name##_channel(F s, F d, F sa, F da); \ |
| STAGE(name) { \ |
| r = name##_channel(r,dr,a,da); \ |
| g = name##_channel(g,dg,a,da); \ |
| b = name##_channel(b,db,a,da); \ |
| a = name##_channel(a,da,a,da); \ |
| } \ |
| SI F name##_channel(F s, F d, F sa, F da) |
| |
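| // s,d are the premultiplied source and dest channels; sa,da are their alphas (all SkFixed15). |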
| BLEND_MODE(clear) { return 0.0f; } |
| BLEND_MODE(srcatop) { return s*da + d*inv(sa); } |
| BLEND_MODE(dstatop) { return d*sa + s*inv(da); } |
| BLEND_MODE(srcin) { return s * da; } |
| BLEND_MODE(dstin) { return d * sa; } |
| BLEND_MODE(srcout) { return s * inv(da); } |
| BLEND_MODE(dstout) { return d * inv(sa); } |
| BLEND_MODE(srcover) { return mad(d, inv(sa), s); } |
| BLEND_MODE(dstover) { return mad(s, inv(da), d); } |
| |
| BLEND_MODE(modulate) { return s*d; } |
| BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } |
| BLEND_MODE(screen) { return s + inv(s)*d; } |
| BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); } |
| |
| #undef BLEND_MODE |