Reland "Use tailcall recursion to process skcms ops."
This reverts commit e0a502ae9dab9b659018cbc07d98dda20a305cec.
Reason for revert: experiment complete - https://screenshot.googleplex.com/3jGGaL8NX3kthzn
Original change's description:
> Revert "Use tailcall recursion to process skcms ops."
>
> This reverts commit 7cf575c421865b23b898e7a1b9cbcd3f6ecc63da.
>
> Reason for revert: This is a temporary revert to test for performance deltas on Pinpoint. Barring unforeseen circumstances, this CL will be relanded shortly.
>
> Original change's description:
> > Use tailcall recursion to process skcms ops.
> >
> > This is dramatically faster on M1 processors (and likely other ARMs)
> > compared to a giant switch. It is also 25% faster on AMD EPYC (hsw).
> > The MSVS + clang-cl configuration fails tests when SKCMS_MUSTTAIL is
> > used, so SKCMS_MUSTTAIL is disabled in this configuration; I don't
> > have access to a Windows machine to check the difference in actual
> > code generation, but hopefully this is a negligible difference.
> >
> > Change-Id: Ie2f9569d19f3804fcf3ba77b6148fe8f5c8e13f9
> > Bug: b/305974160
> > Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/771479
> > Commit-Queue: John Stiles <johnstiles@google.com>
> > Reviewed-by: Brian Osman <brianosman@google.com>
>
> Bug: b/305974160
> Change-Id: I03728c1e9be80c4e8d5a0ae808fb2c977420efab
> Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/784541
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: John Stiles <johnstiles@google.com>
> Auto-Submit: John Stiles <johnstiles@google.com>
Bug: b/305974160
Change-Id: I1a97be6d27ae9ea719c0d8308e3f6b52ab09efa2
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/784990
Bot-Commit: Rubber Stamper <rubber-stamper@appspot.gserviceaccount.com>
Commit-Queue: John Stiles <johnstiles@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 5b3fe49..3920cea 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -769,27 +769,40 @@
template <typename T> operator T*() { return (const T*)fArg; }
};
-#define DECLARE_STAGE(name, arg) \
- SI void Exec_##name##_k(arg, const char* src, char* dst, F& r, F& g, F& b, F& a, int i); \
- \
- SI void Exec_##name(const void* v, const char* s, char* d, F& r, F& g, F& b, F& a, int i) { \
- Exec_##name##_k(Ctx{v}, s, d, r, g, b, a, i); \
- } \
- \
- SI void Exec_##name##_k(arg, \
- SKCMS_MAYBE_UNUSED const char* src, \
- SKCMS_MAYBE_UNUSED char* dst, \
- SKCMS_MAYBE_UNUSED F& r, \
- SKCMS_MAYBE_UNUSED F& g, \
- SKCMS_MAYBE_UNUSED F& b, \
- SKCMS_MAYBE_UNUSED F& a, \
+// We can't declare StageFn as a function pointer which takes a pointer to StageFns; that would be
+// a circular dependency. To avoid this, StageFn is wrapped in a `struct StageList` which forward-
+// declare here.
+struct StageList;
+using StageFn = void (*)(StageList stages, const void** ctx, const char* s, char* d,
+ F r, F g, F b, F a, int i);
+struct StageList {
+ const StageFn* fn;
+};
+
+#define DECLARE_STAGE(name, arg, CALL_NEXT) \
+ SI void Exec_##name##_k(arg, const char* src, char* dst, F& r, F& g, F& b, F& a, int i); \
+ \
+ SI void Exec_##name(StageList list, const void** ctx, const char* s, char* d, \
+ F r, F g, F b, F a, int i) { \
+ Exec_##name##_k(Ctx{*ctx}, s, d, r, g, b, a, i); \
+ ++list.fn; ++ctx; \
+ CALL_NEXT; \
+ } \
+ \
+ SI void Exec_##name##_k(arg, \
+ SKCMS_MAYBE_UNUSED const char* src, \
+ SKCMS_MAYBE_UNUSED char* dst, \
+ SKCMS_MAYBE_UNUSED F& r, \
+ SKCMS_MAYBE_UNUSED F& g, \
+ SKCMS_MAYBE_UNUSED F& b, \
+ SKCMS_MAYBE_UNUSED F& a, \
SKCMS_MAYBE_UNUSED int i)
#define STAGE(name, arg) \
- DECLARE_STAGE(name, arg)
+ DECLARE_STAGE(name, arg, SKCMS_MUSTTAIL return (*list.fn)(list, ctx, s, d, r, g, b, a, i))
#define FINAL_STAGE(name, arg) \
- DECLARE_STAGE(name, arg)
+ DECLARE_STAGE(name, arg, /*just return to exec_stages*/)
STAGE(load_a8, NoCtx) {
a = F_from_U8(load<U8>(src + 1*i));
@@ -1195,7 +1208,7 @@
clut(b2a, &r,&g,&b,&a);
}
-// From here on down, the store_ ops are all "final stages," ending the loop.
+// From here on down, the store_ ops are all "final stages," terminating the tail-call recursion.
FINAL_STAGE(store_a8, NoCtx) {
store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
@@ -1433,29 +1446,31 @@
#endif
}
-static void exec_ops(const Op* ops, const void** contexts,
- const char* src, char* dst, int i) {
- F r = F0, g = F0, b = F0, a = F1;
- while (true) {
- switch (*ops++) {
-#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
- SKCMS_LOAD_OPS(M)
- SKCMS_WORK_OPS(M)
-#undef M
-#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
- SKCMS_STORE_OPS(M)
-#undef M
- }
- }
+SI void exec_stages(StageList list, const void** contexts, const char* src, char* dst, int i) {
+ (*list.fn)(list, contexts, src, dst, F0, F0, F0, F1, i);
}
// NOLINTNEXTLINE(misc-definitions-in-headers)
-void run_program(const Op* program, const void** contexts, ptrdiff_t /*programSize*/,
+void run_program(const Op* program, const void** contexts, ptrdiff_t programSize,
const char* src, char* dst, int n,
const size_t src_bpp, const size_t dst_bpp) {
+ // Convert the program into an array of tailcall stages.
+ StageFn stages[32];
+ assert(programSize <= ARRAY_COUNT(stages));
+
+ static constexpr StageFn kStageFns[] = {
+#define M(name) &Exec_##name,
+ SKCMS_ALL_OPS(M)
+#undef M
+ };
+
+ for (ptrdiff_t index = 0; index < programSize; ++index) {
+ stages[index] = kStageFns[(int)program[index]];
+ }
+
int i = 0;
while (n >= N) {
- exec_ops(program, contexts, src, dst, i);
+ exec_stages({stages}, contexts, src, dst, i);
i += N;
n -= N;
}
@@ -1463,7 +1478,7 @@
char tmp[4*4*N] = {0};
memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
- exec_ops(program, contexts, tmp, tmp, 0);
+ exec_stages({stages}, contexts, tmp, tmp, 0);
memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
}
}
diff --git a/src/skcms_Transform.h b/src/skcms_Transform.h
index 97413f4..51391a0 100644
--- a/src/skcms_Transform.h
+++ b/src/skcms_Transform.h
@@ -17,7 +17,7 @@
/** All transform ops */
-#define SKCMS_LOAD_OPS(M) \
+#define SKCMS_ALL_OPS(M) \
M(load_a8) \
M(load_g8) \
M(load_4444) \
@@ -33,9 +33,7 @@
M(load_hhh) \
M(load_hhhh) \
M(load_fff) \
- M(load_ffff)
-
-#define SKCMS_WORK_OPS(M) \
+ M(load_ffff) \
M(swap_rb) \
M(clamp) \
M(invert) \
@@ -84,31 +82,27 @@
M(table_a) \
\
M(clut_A2B) \
- M(clut_B2A)
-
-#define SKCMS_STORE_OPS(M) \
- M(store_a8) \
- M(store_g8) \
- M(store_4444) \
- M(store_565) \
- M(store_888) \
- M(store_8888) \
- M(store_1010102) \
- M(store_161616LE) \
- M(store_16161616LE) \
- M(store_161616BE) \
- M(store_16161616BE) \
- M(store_101010x_XR) \
- M(store_hhh) \
- M(store_hhhh) \
- M(store_fff) \
+ M(clut_B2A) \
+ M(store_a8) \
+ M(store_g8) \
+ M(store_4444) \
+ M(store_565) \
+ M(store_888) \
+ M(store_8888) \
+ M(store_1010102) \
+ M(store_161616LE) \
+ M(store_16161616LE) \
+ M(store_161616BE) \
+ M(store_16161616BE) \
+ M(store_101010x_XR) \
+ M(store_hhh) \
+ M(store_hhhh) \
+ M(store_fff) \
M(store_ffff)
enum class Op : int {
#define M(op) op,
- SKCMS_LOAD_OPS(M)
- SKCMS_WORK_OPS(M)
- SKCMS_STORE_OPS(M)
+ SKCMS_ALL_OPS(M)
#undef M
};