NEON optimizations for gray -> RGBA (or BGRA) conversions
Swizzle Bench Runtime
Nexus 6P 0.32x
Nexus 9 0.89x
PNG Decode Time (for test set of gray encoded PNGs)
Nexus 6P 0.88x
Nexus 9 0.91x
BUG=skia:4767
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1656383002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1656383002
diff --git a/bench/SwizzleBench.cpp b/bench/SwizzleBench.cpp
index c78f2c9..0f85b59 100644
--- a/bench/SwizzleBench.cpp
+++ b/bench/SwizzleBench.cpp
@@ -32,3 +32,4 @@
DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA));
DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_RGB1", SkOpts::RGB_to_RGB1));
DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1", SkOpts::RGB_to_BGR1));
+DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1));
diff --git a/src/codec/SkSwizzler.cpp b/src/codec/SkSwizzler.cpp
index 7865184..fa93a6e 100644
--- a/src/codec/SkSwizzler.cpp
+++ b/src/codec/SkSwizzler.cpp
@@ -270,6 +270,19 @@
}
}
+static void fast_swizzle_gray_to_n32(  // Row proc: 8-bit gray -> opaque 32-bit pixels.
+ void* dst, const uint8_t* src, int width, int bpp, int deltaSrc, int offset,
+ const SkPMColor ctable[]) {  // ctable is not read by this function.
+
+ // This function must not be called if we are sampling. If we are not
+ // sampling, deltaSrc should equal bpp.
+ SkASSERT(deltaSrc == bpp);
+
+ // Note that there is no need to distinguish between RGB and BGR.
+ // Each color channel will get the same value.
+ SkOpts::gray_to_RGB1((uint32_t*) dst, src + offset, width);  // width pixels, from src + offset.
+}
+
static void swizzle_gray_to_565(
void* SK_RESTRICT dstRow, const uint8_t* SK_RESTRICT src, int dstWidth,
int bytesPerPixel, int deltaSrc, int offset, const SkPMColor ctable[]) {
@@ -639,6 +652,7 @@
switch (dstInfo.colorType()) {
case kN32_SkColorType:
proc = &swizzle_gray_to_n32;
+ fastProc = &fast_swizzle_gray_to_n32;
break;
case kGray_8_SkColorType:
proc = &sample1;
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 669401b..bce6ee1 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -84,6 +84,7 @@
decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
decltype(RGB_to_RGB1) RGB_to_RGB1 = sk_default::RGB_to_RGB1;
decltype(RGB_to_BGR1) RGB_to_BGR1 = sk_default::RGB_to_BGR1;
+ decltype(gray_to_RGB1) gray_to_RGB1 = sk_default::gray_to_RGB1;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 41ad8eb..b5286e4 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -61,7 +61,8 @@
RGBA_to_rgbA, // i.e. just premultiply
RGBA_to_bgrA, // i.e. swap RB and premultiply
RGB_to_RGB1, // i.e. insert an opaque alpha
- RGB_to_BGR1; // i.e. swap RB and insert an opaque alpha
+ RGB_to_BGR1, // i.e. swap RB and insert an opaque alpha
+ gray_to_RGB1; // i.e. set color channels to same value + an opaque alpha
}
#endif//SkOpts_DEFINED
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index dcb057e..79d3140 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -52,5 +52,6 @@
RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
RGB_to_RGB1 = sk_neon::RGB_to_RGB1;
RGB_to_BGR1 = sk_neon::RGB_to_BGR1;
+ gray_to_RGB1 = sk_neon::gray_to_RGB1;
}
}
diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp
index 23fdffb..22eda58 100644
--- a/src/opts/SkOpts_ssse3.cpp
+++ b/src/opts/SkOpts_ssse3.cpp
@@ -23,5 +23,6 @@
RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
RGB_to_RGB1 = sk_ssse3::RGB_to_RGB1;
RGB_to_BGR1 = sk_ssse3::RGB_to_BGR1;
+ gray_to_RGB1 = sk_ssse3::gray_to_RGB1;
}
}
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 14960f3..612700e 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -88,6 +88,16 @@
}
}
+static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
+ const uint8_t* src = (const uint8_t*)vsrc;  // Scalar path: one gray byte in, one 32-bit pixel out.
+ for (int i = 0; i < count; i++) {  // Writes nothing when count <= 0.
+ dst[i] = (uint32_t)0xFF << 24  // Bits 24-31: constant 0xFF (opaque alpha).
+ | (uint32_t)src[i] << 16  // Bits 16-23, 8-15 and 0-7 all receive the
+ | (uint32_t)src[i] << 8  // same gray byte, so the result is the same
+ | (uint32_t)src[i] << 0;  // for RGB and BGR channel orders.
+ }
+}
+
#if defined(SK_ARM_HAS_NEON)
// Rounded divide by 255, (x + 127) / 255
@@ -260,6 +270,47 @@
insert_alpha_should_swaprb<true>(dst, src, count);
}
+static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {  // NEON path.
+ const uint8_t* src = (const uint8_t*) vsrc;  // One gray byte per source pixel.
+ while (count >= 16) {  // Main loop: 16 pixels per iteration.
+ // Load 16 pixels.
+ uint8x16_t gray = vld1q_u8(src);
+
+ // Set each of the color channels.
+ uint8x16x4_t rgba;
+ rgba.val[0] = gray;
+ rgba.val[1] = gray;
+ rgba.val[2] = gray;
+ rgba.val[3] = vdupq_n_u8(0xFF);  // Alpha: 0xFF in every lane (opaque).
+
+ // Store 16 pixels.
+ vst4q_u8((uint8_t*) dst, rgba);  // vst4 interleaves val[0..3]: 4 bytes per pixel.
+ src += 16;  // 16 source bytes consumed.
+ dst += 16;  // dst is uint32_t*, so this is 16 pixels (64 bytes).
+ count -= 16;
+ }
+
+ if (count >= 8) {  // count < 16 here, so this runs at most once.
+ // Load 8 pixels.
+ uint8x8_t gray = vld1_u8(src);
+
+ // Set each of the color channels.
+ uint8x8x4_t rgba;
+ rgba.val[0] = gray;
+ rgba.val[1] = gray;
+ rgba.val[2] = gray;
+ rgba.val[3] = vdup_n_u8(0xFF);  // Alpha: 0xFF in every lane (opaque).
+
+ // Store 8 pixels.
+ vst4_u8((uint8_t*) dst, rgba);  // 64-bit form of the same interleaving store.
+ src += 8;  // 8 source bytes consumed.
+ dst += 8;  // 8 pixels (32 bytes).
+ count -= 8;
+ }
+
+ gray_to_RGB1_portable(dst, src, count);  // Scalar loop for the tail (count < 8 here).
+}
+
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
template <bool kSwapRB>
@@ -401,6 +452,10 @@
insert_alpha_should_swaprb<true>(dst, src, count);
}
+static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {  // SSSE3 branch.
+ gray_to_RGB1_portable(dst, src, count);  // No vector code here; forwards to the scalar loop.
+}
+
#else
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
@@ -423,6 +478,10 @@
RGB_to_BGR1_portable(dst, src, count);
}
+static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {  // Fallback (#else) branch.
+ gray_to_RGB1_portable(dst, src, count);  // Forwards unchanged to the scalar loop.
+}
+
#endif
}