remove templates from SkSwizzler_opts

Spin off the safer part of the SkOpts CL I had to revert.
If we try that CL again, having this in place will make
it easier.

Change-Id: Ic34ce2f23c10466cf1036f9651a544d308f4edec
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/294294
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index be4f1d6..08e90ef 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -189,8 +189,7 @@
     return div255_round(vmull_u8(x, y));
 }
 
-template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
     while (count >= 8) {
         // Load 8 pixels.
         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
@@ -227,11 +226,11 @@
 }
 
 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<false>(dst, src, count);
+    premul_should_swapRB(false, dst, src, count);
 }
 
 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<true>(dst, src, count);
+    premul_should_swapRB(true, dst, src, count);
 }
 
 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
@@ -267,8 +266,8 @@
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
-template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
+static void insert_alpha_should_swaprb(bool kSwapRB,
+                                       uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16x3_t rgb = vld3q_u8(src);
@@ -321,11 +320,11 @@
 }
 
 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<false>(dst, src, count);
+    insert_alpha_should_swaprb(false, dst, src, count);
 }
 
 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<true>(dst, src, count);
+    insert_alpha_should_swaprb(true, dst, src, count);
 }
 
 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
@@ -368,8 +367,7 @@
     gray_to_RGB1_portable(dst, src, count);
 }
 
-template <bool kPremul>
-static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
+static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16x2_t ga = vld2q_u8(src);
@@ -423,16 +421,15 @@
 }
 
 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
-    expand_grayA<false>(dst, src, count);
+    expand_grayA(false, dst, src, count);
 }
 
 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
-    expand_grayA<true>(dst, src, count);
+    expand_grayA(true, dst, src, count);
 }
 
 enum Format { kRGB1, kBGR1 };
-template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
     while (count >= 8) {
         // Load 8 cmyk pixels.
         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
@@ -470,11 +467,11 @@
 }
 
 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kRGB1>(dst, src, count);
+    inverted_cmyk_to(kRGB1, dst, src, count);
 }
 
 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kBGR1>(dst, src, count);
+    inverted_cmyk_to(kBGR1, dst, src, count);
 }
 
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
@@ -489,10 +486,9 @@
     return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
 }
 
-template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
 
-    auto premul8 = [](__m256i* lo, __m256i* hi) {
+    auto premul8 = [=](__m256i* lo, __m256i* hi) {
         const __m256i zeros = _mm256_setzero_si256();
         __m256i planar;
         if (kSwapRB) {
@@ -560,11 +556,11 @@
 }
 
 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<false>(dst, src, count);
+    premul_should_swapRB(false, dst, src, count);
 }
 
 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<true>(dst, src, count);
+    premul_should_swapRB(true, dst, src, count);
 }
 
 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
@@ -585,8 +581,8 @@
 }
 
 // Use SSSE3 impl as AVX2 impl regresses performance on some platforms.
-template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
+static void insert_alpha_should_swaprb(bool kSwapRB,
+                                       uint32_t dst[], const uint8_t* src, int count) {
     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
     __m128i expand;
     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
@@ -620,11 +616,11 @@
 }
 
 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<false>(dst, src, count);
+    insert_alpha_should_swaprb(false, dst, src, count);
 }
 
 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<true>(dst, src, count);
+    insert_alpha_should_swaprb(true, dst, src, count);
 }
 
 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
@@ -737,9 +733,8 @@
 }
 
 enum Format { kRGB1, kBGR1 };
-template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
-    auto convert8 = [](__m256i* lo, __m256i* hi) {
+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
+    auto convert8 = [=](__m256i* lo, __m256i* hi) {
         const __m256i zeros = _mm256_setzero_si256();
         __m256i planar;
         if (kBGR1 == format) {
@@ -808,11 +803,11 @@
 }
 
 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kRGB1>(dst, src, count);
+    inverted_cmyk_to(kRGB1, dst, src, count);
 }
 
 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kBGR1>(dst, src, count);
+    inverted_cmyk_to(kBGR1, dst, src, count);
 }
 
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
@@ -827,10 +822,9 @@
     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
 }
 
-template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
 
-    auto premul8 = [](__m128i* lo, __m128i* hi) {
+    auto premul8 = [=](__m128i* lo, __m128i* hi) {
         const __m128i zeros = _mm_setzero_si128();
         __m128i planar;
         if (kSwapRB) {
@@ -896,11 +890,11 @@
 }
 
 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<false>(dst, src, count);
+    premul_should_swapRB(false, dst, src, count);
 }
 
 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-    premul_should_swapRB<true>(dst, src, count);
+    premul_should_swapRB(true, dst, src, count);
 }
 
 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
@@ -919,8 +913,8 @@
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
-template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
+static void insert_alpha_should_swaprb(bool kSwapRB,
+                                       uint32_t dst[], const uint8_t* src, int count) {
     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
     __m128i expand;
     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
@@ -953,11 +947,11 @@
 }
 
 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<false>(dst, src, count);
+    insert_alpha_should_swaprb(false, dst, src, count);
 }
 
 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-    insert_alpha_should_swaprb<true>(dst, src, count);
+    insert_alpha_should_swaprb(true, dst, src, count);
 }
 
 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
@@ -1038,9 +1032,8 @@
 }
 
 enum Format { kRGB1, kBGR1 };
-template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
-    auto convert8 = [](__m128i* lo, __m128i* hi) {
+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
+    auto convert8 = [=](__m128i* lo, __m128i* hi) {
         const __m128i zeros = _mm_setzero_si128();
         __m128i planar;
         if (kBGR1 == format) {
@@ -1105,11 +1098,11 @@
 }
 
 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kRGB1>(dst, src, count);
+    inverted_cmyk_to(kRGB1, dst, src, count);
 }
 
 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-    inverted_cmyk_to<kBGR1>(dst, src, count);
+    inverted_cmyk_to(kBGR1, dst, src, count);
 }
 
 #else