restore vmull_u8() in color32()
vmull_u8() does u8 * u8 -> u16, 8 at a time. This keeps the loop as
tight as possible in NEON, basically {load,mull,addhn,store,loop}.
Drop N to 4 pixels at at time to make this easier. Depending on how
performance charts go, I may circle back to bring this back up to 8.
Bug: chromium:952502
Change-Id: I17ba6b60c0cc6c6da71b05a4af269d87d76672b5
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/208140
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index 2bdcf7b..381c171 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -41,9 +41,35 @@
namespace SK_OPTS_NS {
+#if defined(SK_ARM_HAS_NEON)
+ // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
+ // TODO(mtklein): I wish I could make this a bit prettier and still get ideal codegen.
+ static inline skvx::Vec<4,uint16_t> mull(skvx::Vec<4,uint8_t> x, skvx::Vec<4,uint8_t> y) {
+ return skvx::to_vec<8,uint16_t>( vmull_u8(skvx::to_vext(skvx::join(x,x)),
+ skvx::to_vext(skvx::join(y,y))) )
+ .lo;
+ }
+ static inline skvx::Vec<16,uint16_t> mull(skvx::Vec<16,uint8_t> x, skvx::Vec<16,uint8_t> y) {
+ uint16x8_t lo = vmull_u8( skvx::to_vext(x.lo), skvx::to_vext(y.lo) ),
+ hi = vmull_u8( skvx::to_vext(x.hi), skvx::to_vext(y.hi) );
+ // TODO: why can't I get skvx::join() to generate the same code as this?
+ skvx::Vec<16,uint16_t> r;
+ memcpy(&r.lo, &lo, sizeof(lo));
+ memcpy(&r.hi, &hi, sizeof(hi));
+ return r;
+ }
+#else
+ // Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
+ template <int N>
+ static inline skvx::Vec<N,uint16_t> mull(skvx::Vec<N,uint8_t> x, skvx::Vec<N,uint8_t> y) {
+ return skvx::cast<uint16_t>(x)
+ * skvx::cast<uint16_t>(y);
+ }
+#endif
+
// Blend constant color over count src pixels, writing into dst.
inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
- constexpr int N = 8; // 4, 16 also reasonable choices
+ constexpr int N = 4; // 8, 16 also reasonable choices
using U32 = skvx::Vec< N, uint32_t>;
using U16 = skvx::Vec<4*N, uint16_t>;
using U8 = skvx::Vec<4*N, uint8_t>;
@@ -55,10 +81,10 @@
// (src * invA + (color << 8) + 128) >> 8
// Should all fit in 16 bits.
- // TODO(mtklein): can we do src * invA with umull on ARM?
- U16 s = skvx::cast<uint16_t>(skvx::bit_pun<U8>(src)),
- c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))),
- d = (s * invA + (c << 8) + 128)>>8;
+ U8 s = skvx::bit_pun<U8>(src),
+ a = U8(invA);
+ U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))),
+ d = (mull(s,a) + (c << 8) + 128)>>8;
return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d));
};