relax CLUT inlining a bit
I decided to take another pass at writing the CLUT code in skcms
as "normal" code, mostly to get rid of the macros now that we're
using C++ and can use templates.
While I was at it, I moved the dimension from being a fundamental
property of each clut() function to an argument, and took away any
mention of force inlining or even inlining. In all, this cut skcms'
code size by about 30%.
Change-Id: I4db09ce0bd7886358c3f30fbf1fc89d88ec83221
Reviewed-on: https://skia-review.googlesource.com/c/161585
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 27d04dd..6135ddf 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -509,81 +509,62 @@
return l + (h-l)*t;
}
-// Color lookup tables, by input dimension and bit depth.
-SI void clut_0_8(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+template <int>
+static void sample_clut(const skcms_A2B*, I32 ix, F* r, F* g, F* b);
+
+template <> void sample_clut<8>(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
U32 rgb = gather_24(a2b->grid_8, ix);
*r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
*g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
*b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
-
- (void)a;
- (void)stride;
-}
-SI void clut_0_16(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
- #if defined(__arm__)
- // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
- *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
- *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
- *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
- #else
- // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
- U64 rgb;
- gather_48(a2b->grid_16, ix, &rgb);
- rgb = swap_endian_16x4(rgb);
-
- *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
- *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
- *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
- #endif
- (void)a;
- (void)stride;
}
-// __attribute__((always_inline)) hits some pathological case in GCC that makes
-// compilation way too slow for my patience.
-#if defined(__clang__)
- #define MAYBE_SI SI
+template <> void sample_clut<16>(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
+#if defined(__arm__)
+ // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
+ *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
+ *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
+ *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
#else
- #define MAYBE_SI static inline
-#endif
+ // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
+ U64 rgb;
+ gather_48(a2b->grid_16, ix, &rgb);
+ rgb = swap_endian_16x4(rgb);
-// These are all the same basic approach: handle one dimension, then the rest recursively.
-// We let "I" be the current dimension, and "J" the previous dimension, I-1. "B" is the bit depth.
-#define DEF_CLUT(I,J,B) \
- MAYBE_SI \
- void clut_##I##_##B(const skcms_A2B* a2b, I32 ix, I32 stride, F* r, F* g, F* b, F a) { \
- I32 limit = cast<I32>(F0); \
- limit += a2b->grid_points[I-1]; \
- \
- const F* srcs[] = { r,g,b,&a }; \
- F src = *srcs[I-1]; \
- \
- F x = max_(F0, min_(src, F1)) * cast<F>(limit - 1); \
- \
- I32 lo = cast<I32>( x ), \
- hi = cast<I32>(minus_1_ulp(x+1.0f)); \
- F lr = *r, lg = *g, lb = *b, \
- hr = *r, hg = *g, hb = *b; \
- clut_##J##_##B(a2b, stride*lo + ix, stride*limit, &lr,&lg,&lb,a); \
- clut_##J##_##B(a2b, stride*hi + ix, stride*limit, &hr,&hg,&hb,a); \
- \
- F t = x - cast<F>(lo); \
- *r = lr + (hr-lr)*t; \
- *g = lg + (hg-lg)*t; \
- *b = lb + (hb-lb)*t; \
+ *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
+ *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
+ *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
+#endif
+}
+
+template <int kBitDepth>
+static void clut(const skcms_A2B* a2b, int dim, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
+ if (dim == 0) {
+ sample_clut<kBitDepth>(a2b,ix, r,g,b);
+ return;
}
-DEF_CLUT(1,0,8)
-DEF_CLUT(2,1,8)
-DEF_CLUT(3,2,8)
-DEF_CLUT(4,3,8)
+ I32 limit = cast<I32>(F0);
+ limit += a2b->grid_points[dim-1];
-DEF_CLUT(1,0,16)
-DEF_CLUT(2,1,16)
-DEF_CLUT(3,2,16)
-DEF_CLUT(4,3,16)
+ const F* srcs[] = { r,g,b,&a };
+ F src = *srcs[dim-1];
+ F x = max_(F0, min_(src, F1)) * cast<F>(limit - 1);
+
+ I32 lo = cast<I32>( x ),
+ hi = cast<I32>(minus_1_ulp(x+1.0f));
+ F lr = *r, lg = *g, lb = *b,
+ hr = *r, hg = *g, hb = *b;
+ clut<kBitDepth>(a2b, dim-1, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);
+ clut<kBitDepth>(a2b, dim-1, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);
+
+ F t = x - cast<F>(lo);
+ *r = lr + (hr-lr)*t;
+ *g = lg + (hg-lg)*t;
+ *b = lb + (hb-lb)*t;
+}
static void exec_ops(const Op* ops, const void** args,
const char* src, char* dst, int i) {
@@ -911,44 +892,44 @@
case Op_clut_1D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_1_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<8>(a2b, 1, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
} break;
case Op_clut_1D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_1_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<16>(a2b, 1, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
} break;
case Op_clut_2D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_2_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<8>(a2b, 2, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
} break;
case Op_clut_2D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_2_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<16>(a2b, 2, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
} break;
case Op_clut_3D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_3_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<8>(a2b, 3, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
} break;
case Op_clut_3D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_3_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<16>(a2b, 3, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
} break;
case Op_clut_4D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_4_8(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<8>(a2b, 4, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
// 'a' was really a CMYK K, so our output is actually opaque.
a = F1;
} break;
case Op_clut_4D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut_4_16(a2b, cast<I32>(F0),cast<I32>(F1), &r,&g,&b,a);
+ clut<16>(a2b, 4, cast<I32>(F0), cast<I32>(F1), &r,&g,&b,a);
// 'a' was really a CMYK K, so our output is actually opaque.
a = F1;
} break;