Fix SIMD compilation error for GCC 8.x and below
GCC versions older than 9.1 don't have _mm_loadu_si64, but they do
have _mm_loadl_epi64 which is equivalent.
https://godbolt.org/z/bbfKr44e7
The name of the intrinsic seems to indicate that the pointer should be
aligned. However, the Intel documentation does not specify any alignment
requirement [1], GCC removed the alignment assertion from its headers [2],
random blogs state that there is no alignment requirement [3], and the
generated assembly code appears to be the same.
While I was in here, I made some alignment checks a bit more
readable.
[1] https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64&ig_expand=4044
"Load 64-bit integer from memory into the first element of dst."
[2] https://gcc.gnu.org/git/?p=gcc.git;a=blobdiff;f=gcc/config/i386/emmintrin.h;h=d5aa46a16f73bb88e11a6f84c5495c113d86127f;hp=b299cbc8178cbae765b8997b5032fe8e96c07657;hb=865fc9684661c46589a30c6021cf44de560940f3;hpb=846fbd1e2b11a524563058d54f59447620497fde
[3] https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/ "...and there is no alignment requirement."
Change-Id: Ic6069048c490d7300f34cf8372455cd31d11d891
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/924839
Commit-Queue: Kaylee Lubick <kjlubick@google.com>
Reviewed-by: Ben Wagner <bungeman@google.com>
diff --git a/src/core/SkBlitter_ARGB32.cpp b/src/core/SkBlitter_ARGB32.cpp
index 8299749..480c249 100644
--- a/src/core/SkBlitter_ARGB32.cpp
+++ b/src/core/SkBlitter_ARGB32.cpp
@@ -13,6 +13,7 @@
#include "include/core/SkRect.h"
#include "include/core/SkTypes.h"
#include "include/private/SkColorData.h"
+#include "include/private/base/SkAlign.h"
#include "include/private/base/SkCPUTypes.h"
#include "include/private/base/SkDebug.h"
#include "include/private/base/SkMalloc.h"
@@ -352,8 +353,8 @@
srcA = SkAlpha255To256(srcA);
if (width >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
+ SkASSERT(SkIsAlign4((uintptr_t) dst));
+ while (!SkIsAlign16((uintptr_t) dst)) {
*dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
mask++;
dst++;
@@ -372,7 +373,8 @@
// Load four destination pixels into dst_sse.
__m128i dst_sse = _mm_load_si128(d);
// Load four 16-bit masks into lower half of mask_sse.
- __m128i mask_sse = _mm_loadu_si64(mask);
+ // mask does *not* actually need to be 16-byte aligned to use this intrinsic
+ __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));
// Check whether masks are equal to 0 and get the highest bit
// of each byte of result, if masks are all zero, we will get
@@ -410,7 +412,7 @@
}
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
- SkColor src, int width, SkPMColor opaqueDst) {
+ SkColor src, int width, SkPMColor opaqueDst) {
if (width <= 0) {
return;
}
@@ -420,8 +422,8 @@
int srcB = SkColorGetB(src);
if (width >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
+ SkASSERT(SkIsAlign4((uintptr_t) dst));
+ while (!SkIsAlign16((uintptr_t) dst)) {
*dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
mask++;
dst++;
@@ -438,7 +440,8 @@
// Load four destination pixels into dst_sse.
__m128i dst_sse = _mm_load_si128(d);
// Load four 16-bit masks into lower half of mask_sse.
- __m128i mask_sse = _mm_loadu_si64(mask);
+ // mask does *not* actually need to be 16-byte aligned to use this intrinsic
+ __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));
// Check whether masks are equal to 0 and get the highest bit
// of each byte of result, if masks are all zero, we will get
@@ -874,8 +877,8 @@
srcA = SkAlpha255To256(srcA);
if (width >= 8) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
+ SkASSERT(SkIsAlign4((uintptr_t) dst));
+ while (!SkIsAlign16((uintptr_t) dst)) {
*dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
mask++;
dst++;
@@ -941,8 +944,8 @@
__m256i xv_zero = __lasx_xvldi(0);
if (width >= 8) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
+ SkASSERT(SkIsAlign4((uintptr_t) dst));
+ while (!SkIsAlign16((uintptr_t) dst)) {
*dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
mask++;
dst++;
@@ -1231,8 +1234,8 @@
srcA = SkAlpha255To256(srcA);
if (width >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
+ SkASSERT(SkIsAlign4((uintptr_t) dst));
+ while (!SkIsAlign16((uintptr_t) dst)) {
*dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
mask++;
dst++;
@@ -1297,8 +1300,8 @@
__m128i v_zero = __lsx_vldi(0);
if (width >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
+ SkASSERT(SkIsAlign4((uintptr_t) dst));
+ while (!SkIsAlign16((uintptr_t) dst)) {
*dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
mask++;
dst++;