Reland "align skvx::Vec<N,T> to N*sizeof(T)"

This is a reland of e3b110dc6e5f7e1c2bdeb16d4d0ca03221c5ee25

PS1 is the original, so best to diff against that.
This is the original with compiler workarounds.

Original change's description:
> align skvx::Vec<N,T> to N*sizeof(T)
>
> This increases the alignment of these vector types.  I would have liked
> to keep the alignment minimal, but it's probably no big deal either way.
>
> In terms of code generation, it doesn't make much difference for x86 or
> ARMv8, but it seems hugely important for good ARMv7 NEON code.  It's a
> ~10x difference for the bench I've been playing around with that spends
> most of its time in that SkOpts::blit_row_color32 routine.
>
> Bug: chromium:952502
> Change-Id: Ib12caad6b9b3f3f6e821ed70bfb57099db37b15f
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/208581
> Commit-Queue: Michael Ludwig <michaelludwig@google.com>
> Reviewed-by: Michael Ludwig <michaelludwig@google.com>
> Auto-Submit: Mike Klein <mtklein@google.com>

Bug: chromium:952502
Cq-Include-Trybots: skia.primary:Test-Win2016-MSVC-GCE-CPU-AVX2-x86-Release-All,Build-Debian9-GCC-mips64el-Debug
Change-Id: Ief99e14ab4de5a56840ed6bb326cf7669c51dc97
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/208681
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index a05301f..90b0cac 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -16,7 +16,10 @@
 //
 // We've also fixed a few of the caveats that used to make SkNx awkward to work
 // with across translation units.  skvx::Vec<N,T> always has N*sizeof(T) size
-// and alignof(T) alignment and is safe to use across translation units freely.
+// and alignment[1][2] and is safe to use across translation units freely.
+//
+// [1] Ideally we'd only align to T, but that tanks ARMv7 NEON codegen.
+// [2] Some compilers barf if we try to use N*sizeof(T), so instead we leave them at T.
 
 #include "SkTypes.h"         // SK_CPU_SSE_LEVEL*, etc.
 #include <algorithm>         // std::min, std::max
@@ -31,6 +34,16 @@
     #include <arm_neon.h>
 #endif
 
+#if !defined(__clang__) && defined(__GNUC__) && defined(__mips64)
+    // GCC 7 hits an internal compiler error when targeting MIPS64.
+    #define SKVX_ALIGNMENT
+#elif !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
+    // Our SkVx unit tests fail when built by MSVC for 32-bit x86.
+    #define SKVX_ALIGNMENT
+#else
+    #define SKVX_ALIGNMENT alignas(N * sizeof(T))
+#endif
+
 
 namespace skvx {
 
@@ -38,8 +51,9 @@
 // This gives Vec a consistent ABI, letting them pass between files compiled with
 // different instruction sets (e.g. SSE2 and AVX2) without fear of ODR violation.
 template <int N, typename T>
-struct Vec {
-    static_assert((N & (N-1)) == 0, "N must be a power of 2.");
+struct SKVX_ALIGNMENT Vec {
+    static_assert((N & (N-1)) == 0,        "N must be a power of 2.");
+    static_assert(sizeof(T) >= alignof(T), "What kind of crazy T is this?");
 
     Vec<N/2,T> lo, hi;
 
@@ -507,5 +521,6 @@
 #undef SINTU
 #undef SINT
 #undef SIT
+#undef SKVX_ALIGNMENT
 
 #endif//SKVX_DEFINED