iterative clut()
Code size drops compared to HEAD, and performance is somewhere between
a little better and seriously better, depending on the platform.
Change-Id: I369eaee83d32162f6dcf69efaa2b39c84e17804e
Reviewed-on: https://skia-review.googlesource.com/c/162320
Commit-Queue: Brian Osman <brianosman@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index faaa372..5cb8dee 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -548,33 +548,64 @@
template <int kBitDepth>
MAYBE_NOINLINE
-static void clut(const skcms_A2B* a2b, int dim, I32 ix, int stride, F* r, F* g, F* b, F a) {
+static void clut(const skcms_A2B* a2b, int dim, F* r, F* g, F* b, F a) {
assert (0 < dim && dim <= 4);
- int limit = a2b->grid_points[dim-1];
+ // Each of these arrays is really foo[dim], but we use foo[4] since we know dim <= 4.
+ I32 lo[4], // Lower bound index contribution for each dimension.
+ hi[4]; // Upper bound index contribution for each dimension.
+ F t[4]; // Weight for upper bound pixel; lower gets 1-t.
- const F* srcs[] = { r,g,b,&a };
- F src = *srcs[dim-1];
+ // O(dim) work first: calculate lo,hi,t, from r,g,b,a.
+ const F inputs[] = { *r,*g,*b,a };
+ for (int i = dim-1, stride = 1; i >= 0; i--) {
+ { // This block could be done in any order...
+ F x = inputs[i] * (float)(a2b->grid_points[i] - 1);
- F x = src * (float)(limit - 1);
+ lo[i] = cast<I32>( x ); // i.e. trunc(x) == floor(x) here.
+ hi[i] = cast<I32>(minus_1_ulp(x+1.0f));
+ t [i] = x - cast<F>(lo[i]); // i.e. fract(x)
+ }
- I32 lo = cast<I32>( x ),
- hi = cast<I32>(minus_1_ulp(x+1.0f));
- F lr = *r, lg = *g, lb = *b,
- hr = *r, hg = *g, hb = *b;
-
- if (dim == 1) {
- sample_clut<kBitDepth>(a2b, stride*lo + ix, &lr,&lg,&lb);
- sample_clut<kBitDepth>(a2b, stride*hi + ix, &hr,&hg,&hb);
- } else {
- clut<kBitDepth>(a2b, dim-1, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);
- clut<kBitDepth>(a2b, dim-1, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);
+ { // ... but this block must go back to front to get stride right.
+ lo[i] *= stride;
+ hi[i] *= stride;
+ stride *= a2b->grid_points[i];
+ }
}
- F t = x - cast<F>(lo);
- *r = lr + (hr-lr)*t;
- *g = lg + (hg-lg)*t;
- *b = lb + (hb-lb)*t;
+ // It's sometimes a little faster to accumulate into R,G,B than into *r,*g,*b.
+ F R = F0,
+ G = F0,
+ B = F0;
+
+ // We'll sample 2^dim == 1<<dim table entries per pixel,
+ // in all combinations of low and high in each dimension.
+ for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
+ I32 ix = cast<I32>(F0);
+ F w = F1;
+
+ for (int i = 0; i < dim; i++) { // This loop can be done in any order.
+ if (combo & (1<<i)) { // It's arbitrary whether lo=0,hi=1 or lo=1,hi=0.
+ ix += hi[i];
+ w *= t[i];
+ } else {
+ ix += lo[i];
+ w *= 1-t[i];
+ }
+ }
+
+ F sR,sG,sB;
+ sample_clut<kBitDepth>(a2b,ix, &sR,&sG,&sB);
+
+ R += w*sR;
+ G += w*sG;
+ B += w*sB;
+ }
+
+ *r = R;
+ *g = G;
+ *b = B;
}
static void exec_ops(const Op* ops, const void** args,
@@ -903,44 +934,44 @@
case Op_clut_1D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<8>(a2b, 1, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<8>(a2b, 1, &r,&g,&b,a);
} break;
case Op_clut_1D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<16>(a2b, 1, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<16>(a2b, 1, &r,&g,&b,a);
} break;
case Op_clut_2D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<8>(a2b, 2, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<8>(a2b, 2, &r,&g,&b,a);
} break;
case Op_clut_2D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<16>(a2b, 2, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<16>(a2b, 2, &r,&g,&b,a);
} break;
case Op_clut_3D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<8>(a2b, 3, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<8>(a2b, 3, &r,&g,&b,a);
} break;
case Op_clut_3D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<16>(a2b, 3, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<16>(a2b, 3, &r,&g,&b,a);
} break;
case Op_clut_4D_8:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<8>(a2b, 4, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<8>(a2b, 4, &r,&g,&b,a);
// 'a' was really a CMYK K, so our output is actually opaque.
a = F1;
} break;
case Op_clut_4D_16:{
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
- clut<16>(a2b, 4, cast<I32>(F0), 1, &r,&g,&b,a);
+ clut<16>(a2b, 4, &r,&g,&b,a);
// 'a' was really a CMYK K, so our output is actually opaque.
a = F1;
} break;