Revert "Use tailcall recursion to process skcms ops." This reverts commit 7cf575c421865b23b898e7a1b9cbcd3f6ecc63da. Reason for revert: This is a temporary revert to test for performance deltas on Pinpoint. Barring unforeseen circumstances, this CL will be relanded shortly. Original change's description: > Use tailcall recursion to process skcms ops. > > This is dramatically faster on M1 processors (and likely other ARMs) > compared to a giant switch. It is also 25% faster on AMD EPYC (hsw). > The MSVS + clang-cl configuration fails tests when SKCMS_MUSTTAIL is > used, so SKCMS_MUSTTAIL is disabled in this configuration; I don't > have access to a Windows machine to check the difference in actual > code generation, but hopefully this is a negligible difference. > > Change-Id: Ie2f9569d19f3804fcf3ba77b6148fe8f5c8e13f9 > Bug: b/305974160 > Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/771479 > Commit-Queue: John Stiles <johnstiles@google.com> > Reviewed-by: Brian Osman <brianosman@google.com> Bug: b/305974160 Change-Id: I03728c1e9be80c4e8d5a0ae808fb2c977420efab Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/784541 Reviewed-by: Brian Osman <brianosman@google.com> Commit-Queue: John Stiles <johnstiles@google.com> Auto-Submit: John Stiles <johnstiles@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h index 3920cea..5b3fe49 100644 --- a/src/Transform_inl.h +++ b/src/Transform_inl.h
@@ -769,40 +769,27 @@ template <typename T> operator T*() { return (const T*)fArg; } }; -// We can't declare StageFn as a function pointer which takes a pointer to StageFns; that would be -// a circular dependency. To avoid this, StageFn is wrapped in a `struct StageList` which forward- -// declare here. -struct StageList; -using StageFn = void (*)(StageList stages, const void** ctx, const char* s, char* d, - F r, F g, F b, F a, int i); -struct StageList { - const StageFn* fn; -}; - -#define DECLARE_STAGE(name, arg, CALL_NEXT) \ - SI void Exec_##name##_k(arg, const char* src, char* dst, F& r, F& g, F& b, F& a, int i); \ - \ - SI void Exec_##name(StageList list, const void** ctx, const char* s, char* d, \ - F r, F g, F b, F a, int i) { \ - Exec_##name##_k(Ctx{*ctx}, s, d, r, g, b, a, i); \ - ++list.fn; ++ctx; \ - CALL_NEXT; \ - } \ - \ - SI void Exec_##name##_k(arg, \ - SKCMS_MAYBE_UNUSED const char* src, \ - SKCMS_MAYBE_UNUSED char* dst, \ - SKCMS_MAYBE_UNUSED F& r, \ - SKCMS_MAYBE_UNUSED F& g, \ - SKCMS_MAYBE_UNUSED F& b, \ - SKCMS_MAYBE_UNUSED F& a, \ +#define DECLARE_STAGE(name, arg) \ + SI void Exec_##name##_k(arg, const char* src, char* dst, F& r, F& g, F& b, F& a, int i); \ + \ + SI void Exec_##name(const void* v, const char* s, char* d, F& r, F& g, F& b, F& a, int i) { \ + Exec_##name##_k(Ctx{v}, s, d, r, g, b, a, i); \ + } \ + \ + SI void Exec_##name##_k(arg, \ + SKCMS_MAYBE_UNUSED const char* src, \ + SKCMS_MAYBE_UNUSED char* dst, \ + SKCMS_MAYBE_UNUSED F& r, \ + SKCMS_MAYBE_UNUSED F& g, \ + SKCMS_MAYBE_UNUSED F& b, \ + SKCMS_MAYBE_UNUSED F& a, \ SKCMS_MAYBE_UNUSED int i) #define STAGE(name, arg) \ - DECLARE_STAGE(name, arg, SKCMS_MUSTTAIL return (*list.fn)(list, ctx, s, d, r, g, b, a, i)) + DECLARE_STAGE(name, arg) #define FINAL_STAGE(name, arg) \ - DECLARE_STAGE(name, arg, /*just return to exec_stages*/) + DECLARE_STAGE(name, arg) STAGE(load_a8, NoCtx) { a = F_from_U8(load<U8>(src + 1*i)); @@ -1208,7 +1195,7 @@ clut(b2a, &r,&g,&b,&a); } -// From here on down, the store_ ops are all "final stages," terminating the tail-call recursion. +// From here on down, the store_ ops are all "final stages," ending the loop. FINAL_STAGE(store_a8, NoCtx) { store(dst + 1*i, cast<U8>(to_fixed(a * 255))); @@ -1446,31 +1433,29 @@ #endif } -SI void exec_stages(StageList list, const void** contexts, const char* src, char* dst, int i) { - (*list.fn)(list, contexts, src, dst, F0, F0, F0, F1, i); +static void exec_ops(const Op* ops, const void** contexts, + const char* src, char* dst, int i) { + F r = F0, g = F0, b = F0, a = F1; + while (true) { + switch (*ops++) { +#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break; + SKCMS_LOAD_OPS(M) + SKCMS_WORK_OPS(M) +#undef M +#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return; + SKCMS_STORE_OPS(M) +#undef M + } + } } // NOLINTNEXTLINE(misc-definitions-in-headers) -void run_program(const Op* program, const void** contexts, ptrdiff_t programSize, +void run_program(const Op* program, const void** contexts, ptrdiff_t /*programSize*/, const char* src, char* dst, int n, const size_t src_bpp, const size_t dst_bpp) { - // Convert the program into an array of tailcall stages. - StageFn stages[32]; - assert(programSize <= ARRAY_COUNT(stages)); - - static constexpr StageFn kStageFns[] = { -#define M(name) &Exec_##name, - SKCMS_ALL_OPS(M) -#undef M - }; - - for (ptrdiff_t index = 0; index < programSize; ++index) { - stages[index] = kStageFns[(int)program[index]]; - } - int i = 0; while (n >= N) { - exec_stages({stages}, contexts, src, dst, i); + exec_ops(program, contexts, src, dst, i); i += N; n -= N; } @@ -1478,7 +1463,7 @@ char tmp[4*4*N] = {0}; memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp); - exec_stages({stages}, contexts, tmp, tmp, 0); + exec_ops(program, contexts, tmp, tmp, 0); memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp); } }
diff --git a/src/skcms_Transform.h b/src/skcms_Transform.h index 51391a0..97413f4 100644 --- a/src/skcms_Transform.h +++ b/src/skcms_Transform.h
@@ -17,7 +17,7 @@ /** All transform ops */ -#define SKCMS_ALL_OPS(M) \ +#define SKCMS_LOAD_OPS(M) \ M(load_a8) \ M(load_g8) \ M(load_4444) \ @@ -33,7 +33,9 @@ M(load_hhh) \ M(load_hhhh) \ M(load_fff) \ - M(load_ffff) \ + M(load_ffff) + +#define SKCMS_WORK_OPS(M) \ M(swap_rb) \ M(clamp) \ M(invert) \ @@ -82,27 +84,31 @@ M(table_a) \ \ M(clut_A2B) \ - M(clut_B2A) \ - M(store_a8) \ - M(store_g8) \ - M(store_4444) \ - M(store_565) \ - M(store_888) \ - M(store_8888) \ - M(store_1010102) \ - M(store_161616LE) \ - M(store_16161616LE) \ - M(store_161616BE) \ - M(store_16161616BE) \ - M(store_101010x_XR) \ - M(store_hhh) \ - M(store_hhhh) \ - M(store_fff) \ + M(clut_B2A) + +#define SKCMS_STORE_OPS(M) \ + M(store_a8) \ + M(store_g8) \ + M(store_4444) \ + M(store_565) \ + M(store_888) \ + M(store_8888) \ + M(store_1010102) \ + M(store_161616LE) \ + M(store_16161616LE) \ + M(store_161616BE) \ + M(store_16161616BE) \ + M(store_101010x_XR) \ + M(store_hhh) \ + M(store_hhhh) \ + M(store_fff) \ M(store_ffff) enum class Op : int { #define M(op) op, - SKCMS_ALL_OPS(M) + SKCMS_LOAD_OPS(M) + SKCMS_WORK_OPS(M) + SKCMS_STORE_OPS(M) #undef M };