arithmetic mode with Sk4f
[Cherry-pick of d58f840650a6768b50d024247e2817ccbacd8a0d to chrome/m50 branch.]
After reading the SSE version, I figured I'd show off the new hotness a little. This'll get us SSE, NEON and portable implementations all in one easy to read package.
Since we've been talking about it, it's worth noting the several ways this implementation is still not constant time:
- short circuits on 0x00 and 0xff coverage;
- floating point multiplication with untrusted k1-k4; if someone figures out a clever way to sometimes create denorm floats and sometimes not, there's a gigantic performance difference.
I would hazard the pin is constant time now though.
I've also fixed the lerp to lerp between dst and r instead of src and r. That can't have been right.
curr/maxrss loops min median mean max stddev samples config bench
9/9 MB 1 25.5ms 25.5ms 25.5ms 25.5ms 0% ▃▁▁▃▂▇▅▆▇█ 8888 Xfermode_arithmetic_enforce_pm_aa
9/9 MB 1 24.1ms 24.2ms 24.2ms 24.3ms 0% ▄▃▁▄█▆▆█▃█ 8888 Xfermode_arithmetic_aa
9/9 MB 1 102ms 102ms 102ms 103ms 0% ▁▅▂▆▂█▂█▁▂ 8888 Xfermode_arithmetic_enforce_pm
9/9 MB 1 94.8ms 95.4ms 95.2ms 95.8ms 0% ▅▅▁▁▁▁▄▇█▇ 8888 Xfermode_arithmetic
~~~~>
curr/maxrss loops min median mean max stddev samples config bench
9/9 MB 1 9.71ms 9.74ms 9.73ms 9.78ms 0% █▅▄▄▁▂▂▂▄▄ 8888 Xfermode_arithmetic_enforce_pm_aa
9/9 MB 1 9.5ms 9.57ms 9.58ms 9.7ms 1% ▂▁█▅▂▂▆▃▄▄ 8888 Xfermode_arithmetic_aa
9/9 MB 1 21.8ms 21.8ms 21.8ms 21.9ms 0% █▂▂▂▂▂▂▁▄▂ 8888 Xfermode_arithmetic_enforce_pm
9/9 MB 1 16.5ms 16.6ms 16.6ms 16.6ms 0% ▃█▁▁▄▄▁▁▆▅ 8888 Xfermode_arithmetic
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1873963003
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1873963003
NOTREECHECKS=true
NOTRY=true
NOPRESUBMIT=true
TBR=mtklein@google.com
Review URL: https://codereview.chromium.org/1902903002 .
diff --git a/src/effects/SkArithmeticMode.cpp b/src/effects/SkArithmeticMode.cpp
index c8e0756..954f8eb 100644
--- a/src/effects/SkArithmeticMode.cpp
+++ b/src/effects/SkArithmeticMode.cpp
@@ -7,10 +7,11 @@
#include "SkArithmeticMode.h"
#include "SkColorPriv.h"
+#include "SkNx.h"
#include "SkReadBuffer.h"
-#include "SkWriteBuffer.h"
#include "SkString.h"
#include "SkUnPreMultiply.h"
+#include "SkWriteBuffer.h"
#if SK_SUPPORT_GPU
#include "SkArithmeticMode_gpu.h"
#endif
@@ -63,64 +64,37 @@
return SkArithmeticMode::Create(k1, k2, k3, k4, enforcePMColor);
}
-static int pinToByte(int value) {
- if (value < 0) {
- value = 0;
- } else if (value > 255) {
- value = 255;
- }
- return value;
-}
-
-static int arith(SkScalar k1, SkScalar k2, SkScalar k3, SkScalar k4,
- int src, int dst) {
- SkScalar result = SkScalarMul(k1, src * dst) +
- SkScalarMul(k2, src) +
- SkScalarMul(k3, dst) +
- k4;
- int res = SkScalarRoundToInt(result);
- return pinToByte(res);
-}
-
-static int blend(int src, int dst, int scale) {
- return dst + ((src - dst) * scale >> 8);
-}
-
void SkArithmeticMode_scalar::xfer32(SkPMColor dst[], const SkPMColor src[],
int count, const SkAlpha aaCoverage[]) const {
- SkScalar k1 = fK[0] / 255;
- SkScalar k2 = fK[1];
- SkScalar k3 = fK[2];
- SkScalar k4 = fK[3] * 255;
+ const Sk4f k1 = fK[0] * (1/255.0f),
+ k2 = fK[1],
+ k3 = fK[2],
+ k4 = fK[3] * 255.0f + 0.5f;
- for (int i = 0; i < count; ++i) {
- if ((nullptr == aaCoverage) || aaCoverage[i]) {
- SkPMColor sc = src[i];
- SkPMColor dc = dst[i];
+ auto pin = [](float min, const Sk4f& val, float max) {
+ return Sk4f::Max(min, Sk4f::Min(val, max));
+ };
- int a, r, g, b;
-
- a = arith(k1, k2, k3, k4, SkGetPackedA32(sc), SkGetPackedA32(dc));
- r = arith(k1, k2, k3, k4, SkGetPackedR32(sc), SkGetPackedR32(dc));
- g = arith(k1, k2, k3, k4, SkGetPackedG32(sc), SkGetPackedG32(dc));
- b = arith(k1, k2, k3, k4, SkGetPackedB32(sc), SkGetPackedB32(dc));
- if (fEnforcePMColor) {
- r = SkMin32(r, a);
- g = SkMin32(g, a);
- b = SkMin32(b, a);
- }
-
- // apply antialias coverage if necessary
- if (aaCoverage && 0xFF != aaCoverage[i]) {
- int scale = aaCoverage[i] + (aaCoverage[i] >> 7);
- a = blend(a, SkGetPackedA32(sc), scale);
- r = blend(r, SkGetPackedR32(sc), scale);
- g = blend(g, SkGetPackedG32(sc), scale);
- b = blend(b, SkGetPackedB32(sc), scale);
- }
-
- dst[i] = fEnforcePMColor ? SkPackARGB32(a, r, g, b) : SkPackARGB32NoCheck(a, r, g, b);
+ for (int i = 0; i < count; i++) {
+ if (aaCoverage && aaCoverage[i] == 0) {
+ continue;
}
+
+ Sk4f s = SkNx_cast<float>(Sk4b::Load(src+i)),
+ d = SkNx_cast<float>(Sk4b::Load(dst+i)),
+ r = pin(0, k1*s*d + k2*s + k3*d + k4, 255);
+
+ if (fEnforcePMColor) {
+ Sk4f a = SkNx_shuffle<3,3,3,3>(r);
+ r = Sk4f::Min(a, r);
+ }
+
+ if (aaCoverage && aaCoverage[i] != 255) {
+ Sk4f c = aaCoverage[i] * (1/255.0f);
+ r = d + (r-d)*c;
+ }
+
+ SkNx_cast<uint8_t>(r).store(dst+i);
}
}