NEON optimizations for gray -> RGBA (or BGRA) conversions

Swizzle Bench Runtime
Nexus 6P 0.32x
Nexus 9  0.89x

PNG Decode Time (for test set of gray encoded PNGs)
Nexus 6P 0.88x
Nexus 9  0.91x

BUG=skia:4767
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1656383002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1656383002
diff --git a/bench/SwizzleBench.cpp b/bench/SwizzleBench.cpp
index c78f2c9..0f85b59 100644
--- a/bench/SwizzleBench.cpp
+++ b/bench/SwizzleBench.cpp
@@ -32,3 +32,4 @@
 DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA));
 DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_RGB1",  SkOpts::RGB_to_RGB1));
 DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1",  SkOpts::RGB_to_BGR1));
+DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1));
diff --git a/src/codec/SkSwizzler.cpp b/src/codec/SkSwizzler.cpp
index 7865184..fa93a6e 100644
--- a/src/codec/SkSwizzler.cpp
+++ b/src/codec/SkSwizzler.cpp
@@ -270,6 +270,19 @@
     }
 }
 
+static void fast_swizzle_gray_to_n32(
+        void* dst, const uint8_t* src, int width, int bpp, int deltaSrc, int offset,
+        const SkPMColor ctable[]) {
+    // Non-sampling fast path: hands the row to SkOpts::gray_to_RGB1. ctable is not read.
+    // This function must not be called if we are sampling.  If we are not
+    // sampling, deltaSrc should equal bpp.
+    SkASSERT(deltaSrc == bpp);
+
+    // Note that there is no need to distinguish between RGB and BGR.
+    // Each color channel will get the same value.
+    SkOpts::gray_to_RGB1((uint32_t*) dst, src + offset, width);  // width pixels from src+offset
+}
+
 static void swizzle_gray_to_565(
         void* SK_RESTRICT dstRow, const uint8_t* SK_RESTRICT src, int dstWidth,
         int bytesPerPixel, int deltaSrc, int offset, const SkPMColor ctable[]) {
@@ -639,6 +652,7 @@
             switch (dstInfo.colorType()) {
                 case kN32_SkColorType:
                     proc = &swizzle_gray_to_n32;
+                    fastProc = &fast_swizzle_gray_to_n32;
                     break;
                 case kGray_8_SkColorType:
                     proc = &sample1;
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 669401b..bce6ee1 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -84,6 +84,7 @@
     decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
     decltype(RGB_to_RGB1)  RGB_to_RGB1  = sk_default::RGB_to_RGB1;
     decltype(RGB_to_BGR1)  RGB_to_BGR1  = sk_default::RGB_to_BGR1;
+    decltype(gray_to_RGB1) gray_to_RGB1 = sk_default::gray_to_RGB1;
 
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_ssse3();
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 41ad8eb..b5286e4 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -61,7 +61,8 @@
                         RGBA_to_rgbA,  // i.e. just premultiply
                         RGBA_to_bgrA,  // i.e. swap RB and premultiply
                         RGB_to_RGB1,   // i.e. insert an opaque alpha
-                        RGB_to_BGR1;   // i.e. swap RB and insert an opaque alpha
+                        RGB_to_BGR1,   // i.e. swap RB and insert an opaque alpha
+                        gray_to_RGB1;  // i.e. set color channels to same value + an opaque alpha
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index dcb057e..79d3140 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -52,5 +52,6 @@
         RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
         RGB_to_RGB1  = sk_neon::RGB_to_RGB1;
         RGB_to_BGR1  = sk_neon::RGB_to_BGR1;
+        gray_to_RGB1 = sk_neon::gray_to_RGB1;
     }
 }
diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp
index 23fdffb..22eda58 100644
--- a/src/opts/SkOpts_ssse3.cpp
+++ b/src/opts/SkOpts_ssse3.cpp
@@ -23,5 +23,6 @@
         RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
         RGB_to_RGB1  = sk_ssse3::RGB_to_RGB1;
         RGB_to_BGR1  = sk_ssse3::RGB_to_BGR1;
+        gray_to_RGB1 = sk_ssse3::gray_to_RGB1;
     }
 }
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 14960f3..612700e 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -88,6 +88,16 @@
     }
 }
 
+static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*)vsrc;  // scalar path: one gray byte read per pixel
+    for (int i = 0; i < count; i++) {
+        dst[i] = (uint32_t)0xFF   << 24   // bits 24-31: constant 0xFF
+               | (uint32_t)src[i] << 16   // bits 16-23, 8-15 and 0-7: the same gray byte
+               | (uint32_t)src[i] <<  8
+               | (uint32_t)src[i] <<  0;
+    }
+}
+
 #if defined(SK_ARM_HAS_NEON)
 
 // Rounded divide by 255, (x + 127) / 255
@@ -260,6 +270,47 @@
     insert_alpha_should_swaprb<true>(dst, src, count);
 }
 
+static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {  // NEON version
+    const uint8_t* src = (const uint8_t*) vsrc;  // one gray byte per pixel
+    while (count >= 16) {
+        // Load 16 gray bytes (16 pixels).
+        uint8x16_t gray = vld1q_u8(src);
+
+        // The gray vector fills val[0..2]; val[3] is a constant 0xFF in every lane.
+        uint8x16x4_t rgba;
+        rgba.val[0] = gray;
+        rgba.val[1] = gray;
+        rgba.val[2] = gray;
+        rgba.val[3] = vdupq_n_u8(0xFF);
+
+        // Interleaving store of the four vectors: 64 bytes, i.e. 16 output pixels.
+        vst4q_u8((uint8_t*) dst, rgba);
+        src += 16;
+        dst += 16;
+        count -= 16;
+    }
+    // Fewer than 16 pixels remain: convert one block of 8 if there is one.
+    if (count >= 8) {
+        // Load 8 gray bytes (8 pixels).
+        uint8x8_t gray = vld1_u8(src);
+
+        // Same layout as above, using 64-bit vectors.
+        uint8x8x4_t rgba;
+        rgba.val[0] = gray;
+        rgba.val[1] = gray;
+        rgba.val[2] = gray;
+        rgba.val[3] = vdup_n_u8(0xFF);
+
+        // Interleaving store of the four vectors: 32 bytes, i.e. 8 output pixels.
+        vst4_u8((uint8_t*) dst, rgba);
+        src += 8;
+        dst += 8;
+        count -= 8;
+    }
+    // Fewer than 8 pixels remain: finish them with the scalar routine.
+    gray_to_RGB1_portable(dst, src, count);
+}
+
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 
 template <bool kSwapRB>
@@ -401,6 +452,10 @@
     insert_alpha_should_swaprb<true>(dst, src, count);
 }
 
+static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
+    gray_to_RGB1_portable(dst, src, count);  // SSSE3 branch: forwards to the scalar routine
+}
+
 #else
 
 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
@@ -423,6 +478,10 @@
     RGB_to_BGR1_portable(dst, src, count);
 }
 
+static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
+    gray_to_RGB1_portable(dst, src, count);  // #else branch: forwards to the scalar routine
+}
+
 #endif
 
 }