Reland "Use tailcall recursion to process skcms ops."

This reverts commit e0a502ae9dab9b659018cbc07d98dda20a305cec.

Reason for revert: experiment complete - https://screenshot.googleplex.com/3jGGaL8NX3kthzn

Original change's description:
> Revert "Use tailcall recursion to process skcms ops."
>
> This reverts commit 7cf575c421865b23b898e7a1b9cbcd3f6ecc63da.
>
> Reason for revert: This is a temporary revert to test for performance deltas on Pinpoint. Barring unforeseen circumstances, this CL will be relanded shortly.
>
> Original change's description:
> > Use tailcall recursion to process skcms ops.
> >
> > This is dramatically faster on M1 processors (and likely other ARMs)
> > compared to a giant switch. It is also 25% faster on AMD EPYC (hsw).
> > The MSVS + clang-cl configuration fails tests when SKCMS_MUSTTAIL is
> > used, so SKCMS_MUSTTAIL is disabled in this configuration; I don't
> > have access to a Windows machine to check the difference in actual
> > code generation, but hopefully this is a negligible difference.
> >
> > Change-Id: Ie2f9569d19f3804fcf3ba77b6148fe8f5c8e13f9
> > Bug: b/305974160
> > Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/771479
> > Commit-Queue: John Stiles <johnstiles@google.com>
> > Reviewed-by: Brian Osman <brianosman@google.com>
>
> Bug: b/305974160
> Change-Id: I03728c1e9be80c4e8d5a0ae808fb2c977420efab
> Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/784541
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: John Stiles <johnstiles@google.com>
> Auto-Submit: John Stiles <johnstiles@google.com>

Bug: b/305974160
Change-Id: I1a97be6d27ae9ea719c0d8308e3f6b52ab09efa2
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/784990
Bot-Commit: Rubber Stamper <rubber-stamper@appspot.gserviceaccount.com>
Commit-Queue: John Stiles <johnstiles@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 5b3fe49..3920cea 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -769,27 +769,40 @@
     template <typename T> operator T*() { return (const T*)fArg; }
 };
 
-#define DECLARE_STAGE(name, arg)                                                                \
-    SI void Exec_##name##_k(arg, const char* src, char* dst, F& r, F& g, F& b, F& a, int i);    \
-                                                                                                \
-    SI void Exec_##name(const void* v, const char* s, char* d, F& r, F& g, F& b, F& a, int i) { \
-        Exec_##name##_k(Ctx{v}, s, d, r, g, b, a, i);                                           \
-    }                                                                                           \
-                                                                                                \
-    SI void Exec_##name##_k(arg,                                                                \
-                            SKCMS_MAYBE_UNUSED const char* src,                                 \
-                            SKCMS_MAYBE_UNUSED char* dst,                                       \
-                            SKCMS_MAYBE_UNUSED F& r,                                            \
-                            SKCMS_MAYBE_UNUSED F& g,                                            \
-                            SKCMS_MAYBE_UNUSED F& b,                                            \
-                            SKCMS_MAYBE_UNUSED F& a,                                            \
+// We can't declare StageFn as a function pointer which takes a pointer to StageFns; that would be
+// a circular dependency. To avoid this, StageFn is wrapped in a `struct StageList` which forward-
+// declare here.
+struct StageList;
+using StageFn = void (*)(StageList stages, const void** ctx, const char* s, char* d,
+                         F r, F g, F b, F a, int i);
+struct StageList {
+    const StageFn* fn;
+};
+
+#define DECLARE_STAGE(name, arg, CALL_NEXT)                                                  \
+    SI void Exec_##name##_k(arg, const char* src, char* dst, F& r, F& g, F& b, F& a, int i); \
+                                                                                             \
+    SI void Exec_##name(StageList list, const void** ctx, const char* s, char* d,            \
+                        F r, F g, F b, F a, int i) {                                         \
+        Exec_##name##_k(Ctx{*ctx}, s, d, r, g, b, a, i);                                     \
+        ++list.fn; ++ctx;                                                                    \
+        CALL_NEXT;                                                                           \
+    }                                                                                        \
+                                                                                             \
+    SI void Exec_##name##_k(arg,                                                             \
+                            SKCMS_MAYBE_UNUSED const char* src,                              \
+                            SKCMS_MAYBE_UNUSED char* dst,                                    \
+                            SKCMS_MAYBE_UNUSED F& r,                                         \
+                            SKCMS_MAYBE_UNUSED F& g,                                         \
+                            SKCMS_MAYBE_UNUSED F& b,                                         \
+                            SKCMS_MAYBE_UNUSED F& a,                                         \
                             SKCMS_MAYBE_UNUSED int i)
 
 #define STAGE(name, arg) \
-    DECLARE_STAGE(name, arg)
+    DECLARE_STAGE(name, arg, SKCMS_MUSTTAIL return (*list.fn)(list, ctx, s, d, r, g, b, a, i))
 
 #define FINAL_STAGE(name, arg) \
-    DECLARE_STAGE(name, arg)
+    DECLARE_STAGE(name, arg, /*just return to exec_stages*/)
 
 STAGE(load_a8, NoCtx) {
     a = F_from_U8(load<U8>(src + 1*i));
@@ -1195,7 +1208,7 @@
     clut(b2a, &r,&g,&b,&a);
 }
 
-// From here on down, the store_ ops are all "final stages," ending the loop.
+// From here on down, the store_ ops are all "final stages," terminating the tail-call recursion.
 
 FINAL_STAGE(store_a8, NoCtx) {
     store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
@@ -1433,29 +1446,31 @@
 #endif
 }
 
-static void exec_ops(const Op* ops, const void** contexts,
-                     const char* src, char* dst, int i) {
-    F r = F0, g = F0, b = F0, a = F1;
-    while (true) {
-        switch (*ops++) {
-#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
-            SKCMS_LOAD_OPS(M)
-            SKCMS_WORK_OPS(M)
-#undef M
-#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
-            SKCMS_STORE_OPS(M)
-#undef M
-        }
-    }
+SI void exec_stages(StageList list, const void** contexts, const char* src, char* dst, int i) {
+    (*list.fn)(list, contexts, src, dst, F0, F0, F0, F1, i);
 }
 
 // NOLINTNEXTLINE(misc-definitions-in-headers)
-void run_program(const Op* program, const void** contexts, ptrdiff_t /*programSize*/,
+void run_program(const Op* program, const void** contexts, ptrdiff_t programSize,
                  const char* src, char* dst, int n,
                  const size_t src_bpp, const size_t dst_bpp) {
+    // Convert the program into an array of tailcall stages.
+    StageFn stages[32];
+    assert(programSize <= ARRAY_COUNT(stages));
+
+    static constexpr StageFn kStageFns[] = {
+#define M(name) &Exec_##name,
+        SKCMS_ALL_OPS(M)
+#undef M
+    };
+
+    for (ptrdiff_t index = 0; index < programSize; ++index) {
+        stages[index] = kStageFns[(int)program[index]];
+    }
+
     int i = 0;
     while (n >= N) {
-        exec_ops(program, contexts, src, dst, i);
+        exec_stages({stages}, contexts, src, dst, i);
         i += N;
         n -= N;
     }
@@ -1463,7 +1478,7 @@
         char tmp[4*4*N] = {0};
 
         memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
-        exec_ops(program, contexts, tmp, tmp, 0);
+        exec_stages({stages}, contexts, tmp, tmp, 0);
         memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
     }
 }
diff --git a/src/skcms_Transform.h b/src/skcms_Transform.h
index 97413f4..51391a0 100644
--- a/src/skcms_Transform.h
+++ b/src/skcms_Transform.h
@@ -17,7 +17,7 @@
 
 /** All transform ops */
 
-#define SKCMS_LOAD_OPS(M) \
+#define SKCMS_ALL_OPS(M) \
     M(load_a8)            \
     M(load_g8)            \
     M(load_4444)          \
@@ -33,9 +33,7 @@
     M(load_hhh)           \
     M(load_hhhh)          \
     M(load_fff)           \
-    M(load_ffff)
-
-#define SKCMS_WORK_OPS(M) \
+    M(load_ffff)          \
     M(swap_rb)            \
     M(clamp)              \
     M(invert)             \
@@ -84,31 +82,27 @@
     M(table_a)            \
                           \
     M(clut_A2B)           \
-    M(clut_B2A)
-
-#define SKCMS_STORE_OPS(M) \
-    M(store_a8)            \
-    M(store_g8)            \
-    M(store_4444)          \
-    M(store_565)           \
-    M(store_888)           \
-    M(store_8888)          \
-    M(store_1010102)       \
-    M(store_161616LE)      \
-    M(store_16161616LE)    \
-    M(store_161616BE)      \
-    M(store_16161616BE)    \
-    M(store_101010x_XR)    \
-    M(store_hhh)           \
-    M(store_hhhh)          \
-    M(store_fff)           \
+    M(clut_B2A)           \
+    M(store_a8)           \
+    M(store_g8)           \
+    M(store_4444)         \
+    M(store_565)          \
+    M(store_888)          \
+    M(store_8888)         \
+    M(store_1010102)      \
+    M(store_161616LE)     \
+    M(store_16161616LE)   \
+    M(store_161616BE)     \
+    M(store_16161616BE)   \
+    M(store_101010x_XR)   \
+    M(store_hhh)          \
+    M(store_hhhh)         \
+    M(store_fff)          \
     M(store_ffff)
 
 enum class Op : int {
 #define M(op) op,
-    SKCMS_LOAD_OPS(M)
-    SKCMS_WORK_OPS(M)
-    SKCMS_STORE_OPS(M)
+    SKCMS_ALL_OPS(M)
 #undef M
 };