Fix SIMD compilation error for GCC 8.x and below

GCC versions older than 9.1 don't have _mm_loadu_si64, but they do
have _mm_loadl_epi64, which is equivalent.
https://godbolt.org/z/bbfKr44e7

The name of the intrinsic seems to indicate that its pointer argument
should be aligned. However, the Intel documentation does not specify
any alignment requirement [1], GCC removed the alignment assertion
from its headers [2], blog posts state that there is no alignment
requirement [3], and the generated assembly appears to be the same.

While I was in here, I made some alignment checks a bit more
readable.

[1] https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64&ig_expand=4044
"Load 64-bit integer from memory into the first element of dst."
[2] https://gcc.gnu.org/git/?p=gcc.git;a=blobdiff;f=gcc/config/i386/emmintrin.h;h=d5aa46a16f73bb88e11a6f84c5495c113d86127f;hp=b299cbc8178cbae765b8997b5032fe8e96c07657;hb=865fc9684661c46589a30c6021cf44de560940f3;hpb=846fbd1e2b11a524563058d54f59447620497fde
[3] https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/ "...and there is no alignment requirement."

Change-Id: Ic6069048c490d7300f34cf8372455cd31d11d891
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/924839
Commit-Queue: Kaylee Lubick <kjlubick@google.com>
Reviewed-by: Ben Wagner <bungeman@google.com>
diff --git a/src/core/SkBlitter_ARGB32.cpp b/src/core/SkBlitter_ARGB32.cpp
index 8299749..480c249 100644
--- a/src/core/SkBlitter_ARGB32.cpp
+++ b/src/core/SkBlitter_ARGB32.cpp
@@ -13,6 +13,7 @@
 #include "include/core/SkRect.h"
 #include "include/core/SkTypes.h"
 #include "include/private/SkColorData.h"
+#include "include/private/base/SkAlign.h"
 #include "include/private/base/SkCPUTypes.h"
 #include "include/private/base/SkDebug.h"
 #include "include/private/base/SkMalloc.h"
@@ -352,8 +353,8 @@
         srcA = SkAlpha255To256(srcA);
 
         if (width >= 4) {
-            SkASSERT(((size_t)dst & 0x03) == 0);
-            while (((size_t)dst & 0x0F) != 0) {
+            SkASSERT(SkIsAlign4((uintptr_t) dst));
+            while (!SkIsAlign16((uintptr_t) dst)) {
                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
                 mask++;
                 dst++;
@@ -372,7 +373,8 @@
                 // Load four destination pixels into dst_sse.
                 __m128i dst_sse = _mm_load_si128(d);
                 // Load four 16-bit masks into lower half of mask_sse.
-                __m128i mask_sse = _mm_loadu_si64(mask);
+                // mask does *not* actually need to be 16-byte aligned to use this intrinsic
+                __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));
 
                 // Check whether masks are equal to 0 and get the highest bit
                 // of each byte of result, if masks are all zero, we will get
@@ -410,7 +412,7 @@
     }
 
     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
-                                   SkColor src, int width, SkPMColor opaqueDst) {
+                               SkColor src, int width, SkPMColor opaqueDst) {
         if (width <= 0) {
             return;
         }
@@ -420,8 +422,8 @@
         int srcB = SkColorGetB(src);
 
         if (width >= 4) {
-            SkASSERT(((size_t)dst & 0x03) == 0);
-            while (((size_t)dst & 0x0F) != 0) {
+            SkASSERT(SkIsAlign4((uintptr_t) dst));
+            while (!SkIsAlign16((uintptr_t) dst)) {
                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
                 mask++;
                 dst++;
@@ -438,7 +440,8 @@
                 // Load four destination pixels into dst_sse.
                 __m128i dst_sse = _mm_load_si128(d);
                 // Load four 16-bit masks into lower half of mask_sse.
-                __m128i mask_sse = _mm_loadu_si64(mask);
+                // mask does *not* actually need to be 16-byte aligned to use this intrinsic
+                __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));
 
                 // Check whether masks are equal to 0 and get the highest bit
                 // of each byte of result, if masks are all zero, we will get
@@ -874,8 +877,8 @@
 
         srcA = SkAlpha255To256(srcA);
         if (width >= 8) {
-            SkASSERT(((size_t)dst & 0x03) == 0);
-            while (((size_t)dst & 0x0F) != 0) {
+            SkASSERT(SkIsAlign4((uintptr_t) dst));
+            while (!SkIsAlign16((uintptr_t) dst)) {
                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
                 mask++;
                 dst++;
@@ -941,8 +944,8 @@
         __m256i xv_zero = __lasx_xvldi(0);
 
         if (width >= 8) {
-            SkASSERT(((size_t)dst & 0x03) == 0);
-            while (((size_t)dst & 0x0F) != 0) {
+            SkASSERT(SkIsAlign4((uintptr_t) dst));
+            while (!SkIsAlign16((uintptr_t) dst)) {
                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
                 mask++;
                 dst++;
@@ -1231,8 +1234,8 @@
 
         srcA = SkAlpha255To256(srcA);
         if (width >= 4) {
-            SkASSERT(((size_t)dst & 0x03) == 0);
-            while (((size_t)dst & 0x0F) != 0) {
+            SkASSERT(SkIsAlign4((uintptr_t) dst));
+            while (!SkIsAlign16((uintptr_t) dst)) {
                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
                 mask++;
                 dst++;
@@ -1297,8 +1300,8 @@
         __m128i v_zero = __lsx_vldi(0);
 
         if (width >= 4) {
-            SkASSERT(((size_t)dst & 0x03) == 0);
-            while (((size_t)dst & 0x0F) != 0) {
+            SkASSERT(SkIsAlign4((uintptr_t) dst));
+            while (!SkIsAlign16((uintptr_t) dst)) {
                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
                 mask++;
                 dst++;