B2A part 2, CLUT refactoring

This refactors src/Transform_inl.h to support both A2B and B2A CLUTs.
It should not change A2B's behavior, and B2A is not yet active.  If
anything changes, it ought to be just A2B performance.

Change-Id: I65a2cd43d91bb244e7f6868adb2e7840b20ef0a5
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/391136
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
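
In outline, the change extracts the CLUT sampling core into a helper
parameterized on channel counts and grid storage, with a thin overload per
table type.  A distilled sketch of the resulting shape (signatures match the
diff below; bodies elided):

    // Shared core, handling 3- and 4-output CLUTs alike.
    static void clut(uint32_t input_channels, uint32_t output_channels,
                     const uint8_t grid_points[4],
                     const uint8_t* grid_8, const uint8_t* grid_16,
                     F* r, F* g, F* b, F* a);

    // An A2B CLUT may read alpha as a fourth input but never produces one,
    // so the A2B wrapper takes alpha by value and any alpha the core
    // writes is discarded...
    static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a);

    // ...while a 4-output B2A CLUT can produce alpha, so the B2A wrapper
    // takes it by pointer.
    static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a);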
diff --git a/skcms.cc b/skcms.cc
index 46086f1..aabe489 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -2299,7 +2299,8 @@
     Op_table_b,
     Op_table_a,
 
-    Op_clut,
+    Op_clut_A2B,
+    Op_clut_B2A,
 
     Op_store_a8,
     Op_store_g8,
@@ -2705,8 +2706,8 @@
                         *args++ = oa.arg;
                     }
                 }
-                *ops++ = Op_clamp;
-                *ops++ = Op_clut;
+                *ops++  = Op_clamp;
+                *ops++  = Op_clut_A2B;
                 *args++ = &srcProfile->A2B;
             }
 
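For context on this emission pattern: ops and args are parallel streams that
exec_ops() (later in this diff) walks in lockstep, with each op that needs a
pointer argument consuming one *args++.  A minimal sketch of the consuming
side, under that assumption:

    // Sketch of the consumer loop, mirroring exec_ops() in Transform_inl.h;
    // ops and args advance in lockstep.
    while (true) {
        switch (*ops++) {
            case Op_clamp:
                // no argument: clamp r,g,b to [0,1]
                break;
            case Op_clut_A2B: {
                const skcms_A2B* a2b = (const skcms_A2B*)*args++;
                // sample the CLUT via clut(a2b, ...)
            } break;
            // ... the store_ ops return, ending the loop ...
        }
    }
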
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 2512fae..8eef6a6 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -661,24 +661,34 @@
     return l + (h-l)*t;
 }
 
-SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
-    U32 rgb = gather_24(a2b->grid_8, ix);
+SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
+    U32 rgb = gather_24(grid_8, ix);
 
     *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
     *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
     *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
 }
 
-SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
+SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
+    // TODO: don't forget to optimize gather_32().
+    U32 rgba = gather_32(grid_8, ix);
+
+    *r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
+    *g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
+    *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+    *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
+}
+
+SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
 #if defined(__arm__)
     // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
-    *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
-    *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
-    *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
+    *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
+    *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
+    *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
 #else
     // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
     U64 rgb;
-    gather_48(a2b->grid_16, ix, &rgb);
+    gather_48(grid_16, ix, &rgb);
     rgb = swap_endian_16x4(rgb);
 
     *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
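
As a scalar reference for what the vectorized 8-bit samplers above compute per
lane (a sketch, not the skcms code): each grid entry packs one byte per output
channel, rescaled from [0,255] to [0,1], and gather_24()/gather_32() fetch
entry ix starting at byte 3*ix or 4*ix respectively.

    // Scalar sketch of one lane of the RGBA variant of sample_clut_8,
    // assuming the little-endian byte order implied by the shift-and-mask
    // extraction in the diff.
    static void sample_clut_8_scalar(const uint8_t* grid_8, int ix,
                                     float* r, float* g, float* b, float* a) {
        const uint8_t* p = grid_8 + 4*ix;   // 4 bytes per RGBA grid entry
        *r = p[0] * (1/255.0f);
        *g = p[1] * (1/255.0f);
        *b = p[2] * (1/255.0f);
        *a = p[3] * (1/255.0f);
    }

The 16-bit variants are analogous, except that entries are big-endian uint16s
scaled by 1/65535 (hence F_from_U16_BE and swap_endian_16x4 above).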
@@ -687,29 +697,32 @@
 #endif
 }
 
-// GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3)
-// when targeting MIPS 64, i386, or s390x,  I think attempting to inline clut() into exec_ops().
-#if 1 && defined(__GNUC__) && !defined(__clang__) \
-      && (defined(__mips64) || defined(__i386) || defined(__s390x__))
-    #define MAYBE_NOINLINE __attribute__((noinline))
-#else
-    #define MAYBE_NOINLINE
-#endif
+SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
+    // TODO: gather_64()-based fast path?
+    *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
+    *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
+    *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
+    *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
+}
 
-MAYBE_NOINLINE
-static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
-    const int dim = (int)a2b->input_channels;
+static void clut(uint32_t input_channels, uint32_t output_channels,
+                 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
+                 F* r, F* g, F* b, F* a) {
+
+    const int dim = (int)input_channels;
     assert (0 < dim && dim <= 4);
+    assert (output_channels == 3 ||
+            output_channels == 4);
 
     // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
     I32 index [8];  // Index contribution by dimension, first low from 0, then high from 4.
     F   weight[8];  // Weight for each contribution, again first low, then high.
 
     // O(dim) work first: calculate index,weight from r,g,b,a.
-    const F inputs[] = { *r,*g,*b,a };
+    const F inputs[] = { *r,*g,*b,*a };
     for (int i = dim-1, stride = 1; i >= 0; i--) {
         // x is where we logically want to sample the grid in the i-th dimension.
-        F x = inputs[i] * (float)(a2b->grid_points[i] - 1);
+        F x = inputs[i] * (float)(grid_points[i] - 1);
 
         // But we can't index at floats.  lo and hi are the two integer grid points surrounding x.
         I32 lo = cast<I32>(            x      ),   // i.e. trunc(x) == floor(x) here.
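
Per input dimension, the value in [0,1] is mapped onto the grid, then split
into two bracketing integer grid points and a fractional weight.  The
computation of the high index falls just outside this hunk; a scalar sketch of
one standard way to set all of this up:

    // Scalar sketch: map v in [0,1] onto a dimension with n grid points.
    float x  = v * (float)(n - 1);        // logical sample position
    int   lo = (int)x;                    // trunc == floor, since x >= 0
    int   hi = lo + (lo < n-1 ? 1 : 0);   // next grid point, kept in range
    float t  = x - (float)lo;             // fract(x): interpolation weight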
@@ -717,7 +730,7 @@
         // Notice how we fold in the accumulated stride across previous dimensions here.
         index[i+0] = lo * stride;
         index[i+4] = hi * stride;
-        stride *= a2b->grid_points[i];
+        stride *= grid_points[i];
 
         // We'll interpolate between those two integer grid points by t.
         F t = x - cast<F>(lo);  // i.e. fract(x)
@@ -726,6 +739,9 @@
     }
 
     *r = *g = *b = F0;
+    if (output_channels == 4) {
+        *a = F0;
+    }
 
     // We'll sample 2^dim == 1<<dim table entries per pixel,
     // in all combinations of low and high in each dimension.
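
The unrolled selection in the next hunk picks, for each of the 1<<dim
hypercube corners, either the low or the high contribution per dimension,
keyed off the bits of combo.  A loop-form scalar equivalent of that selection
(a sketch: it treats index and weight as plain int/float arrays rather than
the vector types above, and the real code unrolls over dim for speed):

    // index[i+0]/weight[i+0] hold dimension i's low grid point contribution,
    // index[i+4]/weight[i+4] its high one.
    for (int combo = 0; combo < (1 << dim); combo++) {
        int   ix = 0;
        float w  = 1.0f;
        for (int i = 0; i < dim; i++) {
            int hi_side = (combo >> i) & 1;   // bit i: low (0) or high (1)
            ix += index [i + 4*hi_side];
            w  *= weight[i + 4*hi_side];
        }
        // sample grid entry ix, accumulate w * sample into r,g,b(,a)
    }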
@@ -755,19 +771,32 @@
                     w  *= weight[1 + (combo&2)*2];
         }
 
-        F R,G,B;
-        if (a2b->grid_8) {
-            sample_clut_8 (a2b,ix, &R,&G,&B);
+        F R,G,B,A=F0;
+        if (output_channels == 3) {
+            if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
+            else        { sample_clut_16(grid_16,ix, &R,&G,&B); }
         } else {
-            sample_clut_16(a2b,ix, &R,&G,&B);
+            if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
+            else        { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
         }
-
         *r += w*R;
         *g += w*G;
         *b += w*B;
+        *a += w*A;
     }
 }
 
+static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
+    clut(a2b->input_channels, a2b->output_channels,
+         a2b->grid_points, a2b->grid_8, a2b->grid_16,
+         r,g,b,&a);
+}
+static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
+    clut(b2a->input_channels, b2a->output_channels,
+         b2a->grid_points, b2a->grid_8, b2a->grid_16,
+         r,g,b,a);
+}
+
 static void exec_ops(const Op* ops, const void** args,
                      const char* src, char* dst, int i) {
     F r = F0, g = F0, b = F0, a = F1;
@@ -1175,7 +1204,7 @@
             case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
             case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
 
-            case Op_clut: {
+            case Op_clut_A2B: {
                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
                 clut(a2b, &r,&g,&b,a);
 
@@ -1185,6 +1214,11 @@
                 }
             } break;
 
+            case Op_clut_B2A: {
+                const skcms_B2A* b2a = (const skcms_B2A*) *args++;
+                clut(b2a, &r,&g,&b,&a);
+            } break;
+
     // Notice, from here on down the store_ ops all return, ending the loop.
 
             case Op_store_a8: {
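
(B2A emission is not part of this change; per the commit message, B2A is not
yet active.  When it is wired up, the op-emission side will presumably mirror
the A2B case in skcms.cc above.  A hypothetical sketch, with dstProfile->B2A
assumed:)

    // Hypothetical, not in this change: emitting the B2A CLUT op would
    // mirror the A2B emission earlier in this diff.
    *ops++  = Op_clamp;
    *ops++  = Op_clut_B2A;
    *args++ = &dstProfile->B2A;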