| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  The wider channels leave headroom for full 8-bit x 8-bit
// products before div255() scales them back down.  It's a last attempt to write
// a performant low-precision backend with stage definitions that can be shared
// by x86 and ARM.
| |
| #include "SkJumper.h" |
| #include "SkJumper_misc.h" |
| |
| #if defined(__clang__) // This file is empty when not compiled by Clang. |
| |
| #if defined(__ARM_NEON) |
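    // 32-bit ARM defaults to passing these wide vector arguments in core
    // registers or on the stack; pcs("aapcs-vfp") keeps them in NEON registers.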
| #include <arm_neon.h> |
| #if defined(__arm__) |
| #define ABI __attribute__((pcs("aapcs-vfp"))) |
| #else |
| #define ABI |
| #endif |
| #elif defined(__SSE2__) |
| #include <immintrin.h> |
| #define ABI |
| #else |
| #define ABI |
| #endif |
| |
| #if !defined(JUMPER_IS_OFFLINE) |
| #define WRAP(name) sk_##name##_lowp |
| #elif defined(__AVX2__) |
| #define WRAP(name) sk_##name##_hsw_lowp |
| #elif defined(__SSE4_1__) |
| #define WRAP(name) sk_##name##_sse41_lowp |
| #elif defined(__SSE2__) |
| #define WRAP(name) sk_##name##_sse2_lowp |
| #endif |
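// WRAP() gives each instruction set its own symbol names, e.g. the SSE4.1
// build of just_return is exported as sk_just_return_sse41_lowp.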
| |
| #if defined(__AVX2__) |
| using U8 = uint8_t __attribute__((ext_vector_type(16))); |
| using U16 = uint16_t __attribute__((ext_vector_type(16))); |
| using I16 = int16_t __attribute__((ext_vector_type(16))); |
| using U32 = uint32_t __attribute__((ext_vector_type(16))); |
| #else |
| using U8 = uint8_t __attribute__((ext_vector_type(8))); |
| using U16 = uint16_t __attribute__((ext_vector_type(8))); |
| using I16 = int16_t __attribute__((ext_vector_type(8))); |
| using U32 = uint32_t __attribute__((ext_vector_type(8))); |
| #endif |
| |
| static const size_t N = sizeof(U16) / sizeof(uint16_t); |
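// So N, the number of pixels each stage call handles, is 16 with AVX2 and 8 otherwise.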
| |
| // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. |
| using Stage = void (ABI*)(size_t tail, void** program, size_t x, size_t y, |
| U16 r, U16 g, U16 b, U16 a, |
| U16 dr, U16 dg, U16 db, U16 da); |
| |
| MAYBE_MSABI |
| ABI extern "C" void WRAP(start_pipeline)(const size_t x0, |
| const size_t y0, |
| const size_t xlimit, |
| const size_t ylimit, |
| void** program) { |
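    // E.g. with N == 8, a 20-pixel row runs the full-vector loop twice
    // (pixels 0-7 and 8-15), then the tail case once with tail == 4 (16-19).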
| auto start = (Stage)load_and_inc(program); |
| for (size_t y = y0; y < ylimit; y++) { |
| size_t x = x0; |
| for (; x + N <= xlimit; x += N) { |
| start( 0,program,x,y, 0,0,0,0, 0,0,0,0); |
| } |
| if (size_t tail = xlimit - x) { |
| start(tail,program,x,y, 0,0,0,0, 0,0,0,0); |
| } |
| } |
| } |
| |
| ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t, |
| U16,U16,U16,U16, U16,U16,U16,U16) {} |
| |
| #define STAGE(name, ...) \ |
| SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail, \ |
| U16& r, U16& g, U16& b, U16& a, \ |
| U16& dr, U16& dg, U16& db, U16& da); \ |
| ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t x, size_t y, \ |
| U16 r, U16 g, U16 b, U16 a, \ |
| U16 dr, U16 dg, U16 db, U16 da) { \ |
| name##_k(Ctx{program}, x,y,tail, r,g,b,a, dr,dg,db,da); \ |
| auto next = (Stage)load_and_inc(program); \ |
| next(tail,program,x,y, r,g,b,a, dr,dg,db,da); \ |
| } \ |
| SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail, \ |
| U16& r, U16& g, U16& b, U16& a, \ |
| U16& dr, U16& dg, U16& db, U16& da) |
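// So STAGE(premul, Ctx::None) defines the exported sk_premul_*_lowp symbol,
// which calls premul_k() and then tail-calls the next stage, and opens the
// definition of premul_k() for the block that follows the macro invocation.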
| |
| // ~~~~~~ Commonly used helper functions ~~~~~~ // |
| |
SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255, off by at most 1.
#endif
}
| |
| SI U16 inv(U16 v) { return 255-v; } |
| |
| SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } |
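// c comes from a vector comparison, so each 16-bit lane is all 0s or all 1s;
// the bitwise blend takes t's lane where c is set and e's lane where it isn't.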
| |
| SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } |
| SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } |
| SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); } |
| SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); } |
| |
| SI U16 from_float(float f) { return f * 255.0f + 0.5f; } |
| |
| SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } |
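// lerp() returns from at t == 0 and to at t == 255, up to div255's rounding:
// e.g. lerp(0, 255, 128) == div255(0*127 + 255*128) == 128.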
| |
| template <typename D, typename S> |
| SI D cast(S src) { |
| return __builtin_convertvector(src, D); |
| } |
| |
| template <typename D, typename S> |
| SI void split(S v, D* lo, D* hi) { |
| static_assert(2*sizeof(D) == sizeof(S), ""); |
| memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); |
| memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); |
| } |
| template <typename D, typename S> |
| SI D join(S lo, S hi) { |
| static_assert(sizeof(D) == 2*sizeof(S), ""); |
| D v; |
| memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); |
| memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); |
| return v; |
| } |
| |
| // ~~~~~~ Basic / misc. stages ~~~~~~ // |
| |
| STAGE(uniform_color, const SkJumper_UniformColorCtx* c) { |
| r = c->rgba[0]; |
| g = c->rgba[1]; |
| b = c->rgba[2]; |
| a = c->rgba[3]; |
| } |
| STAGE(black_color, Ctx::None) { r = g = b = 0; a = 255; } |
| STAGE(white_color, Ctx::None) { r = g = b = 255; a = 255; } |
| |
| STAGE(set_rgb, const float rgb[3]) { |
| r = from_float(rgb[0]); |
| g = from_float(rgb[1]); |
| b = from_float(rgb[2]); |
| } |
| |
| STAGE(premul, Ctx::None) { |
| r = div255(r * a); |
| g = div255(g * a); |
| b = div255(b * a); |
| } |
| |
| STAGE(swap_rb, Ctx::None) { |
| auto tmp = r; |
| r = b; |
| b = tmp; |
| } |
| |
| STAGE(move_src_dst, Ctx::None) { |
| dr = r; |
| dg = g; |
| db = b; |
| da = a; |
| } |
| |
| STAGE(move_dst_src, Ctx::None) { |
| r = dr; |
| g = dg; |
| b = db; |
| a = da; |
| } |
| |
| STAGE(invert, Ctx::None) { |
| r = inv(r); |
| g = inv(g); |
| b = inv(b); |
| a = inv(a); |
| } |
| |
| // ~~~~~~ Blend modes ~~~~~~ // |
| |
| // The same logic applied to all 4 channels. |
| #define BLEND_MODE(name) \ |
| SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
| STAGE(name, Ctx::None) { \ |
| r = name##_channel(r,dr,a,da); \ |
| g = name##_channel(g,dg,a,da); \ |
| b = name##_channel(b,db,a,da); \ |
| a = name##_channel(a,da,a,da); \ |
| } \ |
| SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
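// E.g. BLEND_MODE(srcover) declares srcover_channel(), defines an srcover
// stage that applies it to each of r,g,b,a against the dst channels, and
// leaves the channel math itself to the block following the macro invocation.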
| |
| BLEND_MODE(clear) { return 0; } |
| BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); } |
| BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); } |
| BLEND_MODE(srcin) { return div255( s*da ); } |
| BLEND_MODE(dstin) { return div255( d*sa ); } |
| BLEND_MODE(srcout) { return div255( s*inv(da) ); } |
| BLEND_MODE(dstout) { return div255( d*inv(sa) ); } |
| BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); } |
| BLEND_MODE(dstover) { return d + div255( s*inv(da) ); } |
| BLEND_MODE(modulate) { return div255( s*d ); } |
| BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } |
| BLEND_MODE(plus_) { return min(s+d, 255); } |
| BLEND_MODE(screen) { return s + d - div255( s*d ); } |
| BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); } |
| #undef BLEND_MODE |
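// Sanity check for srcover: s == 128 over opaque d == 255 with sa == 128 gives
// 128 + div255(255*inv(128)) == 128 + div255(32385) == 128 + 127 == 255.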
| |
// The same logic applied to the color channels, with srcover used for alpha.
| #define BLEND_MODE(name) \ |
| SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
| STAGE(name, Ctx::None) { \ |
| r = name##_channel(r,dr,a,da); \ |
| g = name##_channel(g,dg,a,da); \ |
| b = name##_channel(b,db,a,da); \ |
| a = a + div255( da*inv(a) ); \ |
| } \ |
| SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
| |
| BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); } |
| BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); } |
| BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } |
| BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); } |
| |
| BLEND_MODE(hardlight) { |
| return div255( s*inv(da) + d*inv(sa) + |
| if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
| } |
| BLEND_MODE(overlay) { |
| return div255( s*inv(da) + d*inv(sa) + |
| if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
| } |
| #undef BLEND_MODE |
| |
| // ~~~~~~ Helpers for interacting with memory ~~~~~~ // |
| |
| template <typename T> |
| SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t x, size_t y) { |
| return (T*)ctx->pixels + y*ctx->stride + x; |
| } |
| |
| template <typename V, typename T> |
| SI V load(const T* ptr, size_t tail) { |
| V v = 0; |
| switch (tail & (N-1)) { |
| case 0: memcpy(&v, ptr, sizeof(v)); break; |
| #if defined(__AVX2__) |
| case 15: v[14] = ptr[14]; |
| case 14: v[13] = ptr[13]; |
| case 13: v[12] = ptr[12]; |
| case 12: memcpy(&v, ptr, 12*sizeof(T)); break; |
| case 11: v[10] = ptr[10]; |
| case 10: v[ 9] = ptr[ 9]; |
| case 9: v[ 8] = ptr[ 8]; |
| case 8: memcpy(&v, ptr, 8*sizeof(T)); break; |
| #endif |
| case 7: v[ 6] = ptr[ 6]; |
| case 6: v[ 5] = ptr[ 5]; |
| case 5: v[ 4] = ptr[ 4]; |
| case 4: memcpy(&v, ptr, 4*sizeof(T)); break; |
| case 3: v[ 2] = ptr[ 2]; |
| case 2: memcpy(&v, ptr, 2*sizeof(T)); break; |
| case 1: v[ 0] = ptr[ 0]; |
| } |
| return v; |
| } |
| template <typename V, typename T> |
| SI void store(T* ptr, size_t tail, V v) { |
| switch (tail & (N-1)) { |
| case 0: memcpy(ptr, &v, sizeof(v)); break; |
| #if defined(__AVX2__) |
| case 15: ptr[14] = v[14]; |
| case 14: ptr[13] = v[13]; |
| case 13: ptr[12] = v[12]; |
| case 12: memcpy(ptr, &v, 12*sizeof(T)); break; |
| case 11: ptr[10] = v[10]; |
| case 10: ptr[ 9] = v[ 9]; |
| case 9: ptr[ 8] = v[ 8]; |
| case 8: memcpy(ptr, &v, 8*sizeof(T)); break; |
| #endif |
| case 7: ptr[ 6] = v[ 6]; |
| case 6: ptr[ 5] = v[ 5]; |
| case 5: ptr[ 4] = v[ 4]; |
| case 4: memcpy(ptr, &v, 4*sizeof(T)); break; |
| case 3: ptr[ 2] = v[ 2]; |
| case 2: memcpy(ptr, &v, 2*sizeof(T)); break; |
| case 1: ptr[ 0] = v[ 0]; |
| } |
| } |
| |
| // ~~~~~~ 32-bit memory loads and stores ~~~~~~ // |
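
// An 8888 pixel keeps r in its low byte and a in its high byte, so e.g.
// 0x44332211 unpacks to r=0x11, g=0x22, b=0x33, a=0x44.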
| |
| SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
| #if 1 && defined(__ARM_NEON) |
| uint8x8x4_t rgba; |
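    // tail == 0 loads a full 8 pixels in one vld4; the nonzero cases fall
    // through, loading one pixel's four lanes at a time from the top index down.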
| switch (tail & (N-1)) { |
| case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; |
| case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); |
| case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); |
| case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); |
| case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); |
| case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); |
| case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); |
| case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); |
| } |
| *r = cast<U16>(rgba.val[0]); |
| *g = cast<U16>(rgba.val[1]); |
| *b = cast<U16>(rgba.val[2]); |
| *a = cast<U16>(rgba.val[3]); |
| #elif 1 && defined(__AVX2__) |
| // Load normally. |
| U32 rgba = load<U32>(ptr, tail); |
| |
| // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. |
| __m256i _01,_23; |
| split(rgba, &_01, &_23); |
| __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), |
| _13 = _mm256_permute2x128_si256(_01,_23, 0x31); |
| rgba = join<U32>(_02, _13); |
| |
| auto cast_U16 = [](U32 v) -> U16 { |
| __m256i _02,_13; |
| split(v, &_02,&_13); |
| return _mm256_packus_epi32(_02,_13); |
| }; |
| *r = cast_U16(rgba & 65535) & 255; |
| *g = cast_U16(rgba & 65535) >> 8; |
| *b = cast_U16(rgba >> 16) & 255; |
| *a = cast_U16(rgba >> 16) >> 8; |
| #else |
| U32 rgba = load<U32>(ptr, tail); |
| *r = cast<U16>(rgba & 65535) & 255; |
| *g = cast<U16>(rgba & 65535) >> 8; |
| *b = cast<U16>(rgba >> 16) & 255; |
| *a = cast<U16>(rgba >> 16) >> 8; |
| #endif |
| } |
| SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
| #if 1 && defined(__ARM_NEON) |
| uint8x8x4_t rgba = {{ |
| cast<U8>(r), |
| cast<U8>(g), |
| cast<U8>(b), |
| cast<U8>(a), |
| }}; |
| switch (tail & (N-1)) { |
| case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break; |
| case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); |
| case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); |
| case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); |
| case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); |
| case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); |
| case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); |
| case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); |
| } |
| #else |
| store(ptr, tail, cast<U32>(r | (g<<8)) << 0 |
| | cast<U32>(b | (a<<8)) << 16); |
| #endif |
| } |
| |
| STAGE(load_8888, const SkJumper_MemoryCtx* ctx) { |
| load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &r,&g,&b,&a); |
| } |
| STAGE(load_8888_dst, const SkJumper_MemoryCtx* ctx) { |
| load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &dr,&dg,&db,&da); |
| } |
| STAGE(store_8888, const SkJumper_MemoryCtx* ctx) { |
| store_8888(ptr_at_xy<uint32_t>(ctx, x,y), tail, r,g,b,a); |
| } |
| |
| STAGE(load_bgra, const SkJumper_MemoryCtx* ctx) { |
| load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &b,&g,&r,&a); |
| } |
| STAGE(load_bgra_dst, const SkJumper_MemoryCtx* ctx) { |
| load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &db,&dg,&dr,&da); |
| } |
| STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) { |
| store_8888(ptr_at_xy<uint32_t>(ctx, x,y), tail, b,g,r,a); |
| } |
| |
| // ~~~~~~ 16-bit memory loads and stores ~~~~~~ // |
| |
| SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
| // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 |
| U16 rgb = load<U16>(ptr, tail); |
| U16 R = (rgb >> 11) & 31, |
| G = (rgb >> 5) & 63, |
| B = (rgb >> 0) & 31; |
| |
| // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. |
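    // E.g. R == 31 -> (31<<3)|(31>>2) == 248|7 == 255, and G == 63 -> 255 too.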
| *r = (R << 3) | (R >> 2); |
| *g = (G << 2) | (G >> 4); |
| *b = (B << 3) | (B >> 2); |
| } |
| SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { |
| // Select the top 5,6,5 bits. |
| U16 R = r >> 3, |
| G = g >> 2, |
| B = b >> 3; |
| // Pack them back into 15|rrrrr gggggg bbbbb|0. |
| store(ptr, tail, R << 11 |
| | G << 5 |
| | B << 0); |
| } |
| |
| STAGE(load_565, const SkJumper_MemoryCtx* ctx) { |
| load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &r,&g,&b); |
| a = 255; |
| } |
| STAGE(load_565_dst, const SkJumper_MemoryCtx* ctx) { |
| load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &dr,&dg,&db); |
| da = 255; |
| } |
| STAGE(store_565, const SkJumper_MemoryCtx* ctx) { |
| store_565(ptr_at_xy<uint16_t>(ctx, x,y), tail, r,g,b); |
| } |
| |
| // ~~~~~~ 8-bit memory loads and stores ~~~~~~ // |
| |
| SI U16 load_8(const uint8_t* ptr, size_t tail) { |
| return cast<U16>(load<U8>(ptr, tail)); |
| } |
| SI void store_8(uint8_t* ptr, size_t tail, U16 v) { |
| store(ptr, tail, cast<U8>(v)); |
| } |
| |
| STAGE(load_a8, const SkJumper_MemoryCtx* ctx) { |
| r = g = b = 0; |
| a = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail); |
| } |
| STAGE(load_a8_dst, const SkJumper_MemoryCtx* ctx) { |
| dr = dg = db = 0; |
| da = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail); |
| } |
| STAGE(store_a8, const SkJumper_MemoryCtx* ctx) { |
| store_8(ptr_at_xy<uint8_t>(ctx, x,y), tail, a); |
| } |
| |
| STAGE(load_g8, const SkJumper_MemoryCtx* ctx) { |
| r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail); |
| a = 255; |
| } |
| STAGE(load_g8_dst, const SkJumper_MemoryCtx* ctx) { |
| dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail); |
| da = 255; |
| } |
| STAGE(luminance_to_alpha, Ctx::None) { |
| a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. |
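    // 54 + 183 + 19 == 256, so a full-white input maps to exactly 255.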
| r = g = b = 0; |
| } |
| |
| // ~~~~~~ Coverage scales / lerps ~~~~~~ // |
| |
| STAGE(scale_1_float, const float* f) { |
| U16 c = from_float(*f); |
| r = div255( r * c ); |
| g = div255( g * c ); |
| b = div255( b * c ); |
| a = div255( a * c ); |
| } |
| STAGE(lerp_1_float, const float* f) { |
| U16 c = from_float(*f); |
| r = lerp(dr, r, c); |
| g = lerp(dg, g, c); |
| b = lerp(db, b, c); |
| a = lerp(da, a, c); |
| } |
| |
| STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) { |
| U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail); |
| r = div255( r * c ); |
| g = div255( g * c ); |
| b = div255( b * c ); |
| a = div255( a * c ); |
| } |
| STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) { |
| U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail); |
| r = lerp(dr, r, c); |
| g = lerp(dg, g, c); |
| b = lerp(db, b, c); |
| a = lerp(da, a, c); |
| } |
| |
| // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
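// Intuitively this picks whichever channel coverage yields the larger result
// alpha, which should help keep the lerped pixel premultiplied (color <= alpha).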
| SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { |
| return if_then_else(a < da, min(cr,cg,cb) |
| , max(cr,cg,cb)); |
| } |
| STAGE(scale_565, const SkJumper_MemoryCtx* ctx) { |
| U16 cr,cg,cb; |
| load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &cr,&cg,&cb); |
| U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
| |
| r = div255( r * cr ); |
| g = div255( g * cg ); |
| b = div255( b * cb ); |
| a = div255( a * ca ); |
| } |
| STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) { |
| U16 cr,cg,cb; |
| load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &cr,&cg,&cb); |
| U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
| |
| r = lerp(dr, r, cr); |
| g = lerp(dg, g, cg); |
| b = lerp(db, b, cb); |
| a = lerp(da, a, ca); |
| } |
| |
| // ~~~~~~ Compound stages ~~~~~~ // |
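
// srcover_rgba_8888 fuses load_8888_dst, the srcover math, and store_8888 into
// one stage, saving trips through the pipeline's function-pointer dispatch.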
| |
| STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) { |
| auto ptr = ptr_at_xy<uint32_t>(ctx, x,y); |
| |
| load_8888(ptr, tail, &dr,&dg,&db,&da); |
| r = r + div255( dr*inv(a) ); |
| g = g + div255( dg*inv(a) ); |
| b = b + div255( db*inv(a) ); |
| a = a + div255( da*inv(a) ); |
| store_8888(ptr, tail, r,g,b,a); |
| } |
| |
#endif  // defined(__clang__)