restore vmull_u8() in color32() vmull_u8() does u8 * u8 -> u16, 8 at a time. This keeps the loop as tight as possible in NEON, basically {load,mull,addhn,store,loop}. Drop N to 4 pixels at a time to make this easier. Depending on how performance charts go, I may circle back to bring this back up to 8. Bug: chromium:952502 Change-Id: I17ba6b60c0cc6c6da71b05a4af269d87d76672b5 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/208140 Auto-Submit: Mike Klein <mtklein@google.com> Reviewed-by: Michael Ludwig <michaelludwig@google.com> Commit-Queue: Mike Klein <mtklein@google.com>

commit: 3d50730e1246d9350e5fdc3f356cfb235fe9e7e7 [log] [tgz]
author: Mike Klein <mtklein@google.com> Mon Apr 15 08:56:06 2019 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> Mon Apr 15 18:09:39 2019 +0000
tree: 3c9ac61e7214500c46df2f6406dc28dfbbaa87c3
parent: a71b4ce34636a91fc9a4cb492b99740c30a42c72 [diff]
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index 2bdcf7b..381c171 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h

@@ -41,9 +41,35 @@
 
 namespace SK_OPTS_NS {
 
+#if defined(SK_ARM_HAS_NEON)
+    // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
+    // TODO(mtklein): I wish I could make this a bit prettier and still get ideal codegen.
+    static inline skvx::Vec<4,uint16_t> mull(skvx::Vec<4,uint8_t> x, skvx::Vec<4,uint8_t> y) {
+        return skvx::to_vec<8,uint16_t>( vmull_u8(skvx::to_vext(skvx::join(x,x)),
+                                                  skvx::to_vext(skvx::join(y,y))) )
+            .lo;
+    }
+    static inline skvx::Vec<16,uint16_t> mull(skvx::Vec<16,uint8_t> x, skvx::Vec<16,uint8_t> y) {
+        uint16x8_t lo = vmull_u8( skvx::to_vext(x.lo), skvx::to_vext(y.lo) ),
+                   hi = vmull_u8( skvx::to_vext(x.hi), skvx::to_vext(y.hi) );
+        // TODO: why can't I get skvx::join() to generate the same code as this?
+        skvx::Vec<16,uint16_t> r;
+        memcpy(&r.lo, &lo, sizeof(lo));
+        memcpy(&r.hi, &hi, sizeof(hi));
+        return r;
+    }
+#else
+    // Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
+    template <int N>
+    static inline skvx::Vec<N,uint16_t> mull(skvx::Vec<N,uint8_t> x, skvx::Vec<N,uint8_t> y) {
+        return skvx::cast<uint16_t>(x)
+             * skvx::cast<uint16_t>(y);
+    }
+#endif
+
 // Blend constant color over count src pixels, writing into dst.
 inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
-    constexpr int N = 8;  // 4, 16 also reasonable choices
+    constexpr int N = 4;  // 8, 16 also reasonable choices
     using U32 = skvx::Vec<  N, uint32_t>;
     using U16 = skvx::Vec<4*N, uint16_t>;
     using U8  = skvx::Vec<4*N, uint8_t>;
@@ -55,10 +81,10 @@
 
         // (src * invA + (color << 8) + 128) >> 8
         // Should all fit in 16 bits.
-        // TODO(mtklein): can we do src * invA with umull on ARM?
-        U16 s = skvx::cast<uint16_t>(skvx::bit_pun<U8>(src)),
-            c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))),
-            d = (s * invA + (c << 8) + 128)>>8;
+        U8 s = skvx::bit_pun<U8>(src),
+           a = U8(invA);
+        U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))),
+            d = (mull(s,a) + (c << 8) + 128)>>8;
         return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d));
     };
commit	3d50730e1246d9350e5fdc3f356cfb235fe9e7e7	[log] [tgz]
author	Mike Klein <mtklein@google.com>	Mon Apr 15 08:56:06 2019 -0500
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	Mon Apr 15 18:09:39 2019 +0000
tree	3c9ac61e7214500c46df2f6406dc28dfbbaa87c3
parent	a71b4ce34636a91fc9a4cb492b99740c30a42c72 [diff]