Fix some SIMD compiler warnings
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index f952208..4ba2123 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -546,16 +546,16 @@
const uint8_t* s = src_ptr;
size_t n = len;
- __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E, //
- 0x0B, 0x08, 0x09, 0x0A, //
- 0x07, 0x04, 0x05, 0x06, //
- 0x03, 0x00, 0x01, 0x02);
+ __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E, //
+ +0x0B, +0x08, +0x09, +0x0A, //
+ +0x07, +0x04, +0x05, +0x06, //
+ +0x03, +0x00, +0x01, +0x02);
while (n >= 4) {
__m128i x;
- x = _mm_loadu_si128((const void*)s);
+ x = _mm_loadu_si128((const __m128i*)(const void*)s);
x = _mm_shuffle_epi8(x, shuffle);
- _mm_storeu_si128((void*)d, x);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
s += 4 * 4;
d += 4 * 4;
@@ -576,7 +576,7 @@
}
return len;
}
-#endif
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
static uint64_t //
wuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,
@@ -1760,21 +1760,21 @@
const uint8_t* s = src_ptr;
size_t n = len;
- __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B, //
- 0x00, 0x06, 0x07, 0x08, //
- 0x00, 0x03, 0x04, 0x05, //
- 0x00, 0x00, 0x01, 0x02);
- __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00);
+ __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B, //
+ +0x00, +0x06, +0x07, +0x08, //
+ +0x00, +0x03, +0x04, +0x05, //
+ +0x00, +0x00, +0x01, +0x02);
+ __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00);
while (n >= 6) {
__m128i x;
- x = _mm_loadu_si128((const void*)s);
+ x = _mm_loadu_si128((const __m128i*)(const void*)s);
x = _mm_shuffle_epi8(x, shuffle);
- x = _mm_or_si128(x, or);
- _mm_storeu_si128((void*)d, x);
+ x = _mm_or_si128(x, or_ff);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
s += 4 * 3;
d += 4 * 4;
@@ -1797,7 +1797,7 @@
return len;
}
-#endif
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
static uint64_t //
wuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,
@@ -2137,21 +2137,21 @@
const uint8_t* s = src_ptr;
size_t n = len;
- __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03, //
- 0x02, 0x02, 0x02, 0x02, //
- 0x01, 0x01, 0x01, 0x01, //
- 0x00, 0x00, 0x00, 0x00);
- __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00);
+ __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03, //
+ +0x02, +0x02, +0x02, +0x02, //
+ +0x01, +0x01, +0x01, +0x01, //
+ +0x00, +0x00, +0x00, +0x00);
+ __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00);
while (n >= 4) {
__m128i x;
- x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));
+ x = _mm_cvtsi32_si128((int)(wuffs_base__load_u32le__no_bounds_check(s)));
x = _mm_shuffle_epi8(x, shuffle);
- x = _mm_or_si128(x, or);
- _mm_storeu_si128((void*)d, x);
+ x = _mm_or_si128(x, or_ff);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
s += 4 * 1;
d += 4 * 4;
@@ -2169,7 +2169,7 @@
return len;
}
-#endif
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
static uint64_t //
wuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index e0f889f..7a4c1ed 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -551,8 +551,8 @@
"" +
"// --------\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__squash_align4_bgr_565_888(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n\n size_t n = len;\n while (n--) {\n uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n uint32_t b5 = 0x1F & (argb >> (8 - 5));\n uint32_t g6 = 0x3F & (argb >> (16 - 6));\n uint32_t r5 = 0x1F & (argb >> (24 - 5));\n uint32_t alpha = argb & 0xFF000000;\n wuffs_base__store_u32le__no_bounds_check(\n d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n s += 4;\n d += 4;\n }\n return len;\n}\n\nstatic " +
"uint64_t //\nwuffs_base__pixel_swizzler__swap_rgb_bgr(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t len = (dst_len < src_len ? dst_len : src_len) / 3;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n\n size_t n = len;\n while (n--) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n s += 3;\n d += 3;\n }\n return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx__sse128(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palett" +
- "e_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E, //\n 0x0B, 0x08, 0x09, 0x0A, //\n 0x07, 0x04, 0x05, 0x06, //\n 0x03, 0x00, 0x01, 0x02);\n\n while (n >= 4) {\n __m128i x;\n x = _mm_loadu_si128((const void*)s);\n x = _mm_shuffle_epi8(x, shuffle);\n _mm_storeu_si128((void*)d, x);\n\n s += 4 * 4;\n d += 4 * 4;\n n -= 4;\n }\n\n while (n--) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n uint8_t b3 = s[3];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = b3;\n s += 4;\n d += 4;\n }\n return len;\n}\n#endif\n\nstati" +
- "c uint64_t //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n\n size_t n = len;\n while (n--) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n uint8_t b3 = s[3];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = b3;\n s += 4;\n d += 4;\n }\n return len;\n}\n\n" +
+ "e_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E, //\n +0x0B, +0x08, +0x09, +0x0A, //\n +0x07, +0x04, +0x05, +0x06, //\n +0x03, +0x00, +0x01, +0x02);\n\n while (n >= 4) {\n __m128i x;\n x = _mm_loadu_si128((const __m128i*)(const void*)s);\n x = _mm_shuffle_epi8(x, shuffle);\n _mm_storeu_si128((__m128i*)(void*)d, x);\n\n s += 4 * 4;\n d += 4 * 4;\n n -= 4;\n }\n\n while (n--) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n uint8_t b3 = s[3];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = b3;\n s += 4;\n " +
+ " d += 4;\n }\n return len;\n}\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t len = (dst_len < src_len ? dst_len : src_len) / 4;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n\n size_t n = len;\n while (n--) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n uint8_t b3 = s[3];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = b3;\n s += 4;\n d += 4;\n }\n return len;\n}\n\n" +
"" +
"// --------\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len8 = src_len / 8;\n size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n\n size_t n = len;\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4), wuffs_base__color_u64__as__color_u32(\n wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));\n\n s += 1 * 8;\n d += 1 * 4;\n n -= 1;\n }\n return len;\n}\n\n" +
"" +
@@ -595,8 +595,9 @@
"" +
"// --------\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__bgrw__bgr(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len3 = src_len / 3;\n size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n // TODO: unroll.\n\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4),\n 0xFF000000 | wuffs_base__load_u24le__no_bounds_check(s + (0 * 3)));\n\n s += 1 * 3;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__bgrw__bgrx(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palet" +
"te_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len4 = src_len / 4;\n size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n // TODO: unroll.\n\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4),\n 0xFF000000 | wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n\n s += 1 * 4;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t //\nwuffs_base__pixel_swizzler__bgrw__rgb__sse128(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palet" +
- "te_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len3 = src_len / 3;\n size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B, //\n 0x00, 0x06, 0x07, 0x08, //\n 0x00, 0x03, 0x04, 0x05, //\n 0x00, 0x00, 0x01, 0x02);\n __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00, //\n 0xFF, 0x00, 0x00, 0x00, //\n 0xFF, 0x00, 0x00, 0x00, //\n 0xFF, 0x00, 0x00, 0x00);\n\n while (n >= 6) {\n __m128i x;\n x = _mm_loadu_si128((const void*)s);\n x = _mm_shuffle_epi8(x, shuffle);\n x = _mm_or_si128(x, or);\n _mm_storeu_si128((void*)d, x);\n\n s += 4 * 3;\n d += 4 * 4;\n n -= 4" +
- ";\n }\n\n while (n >= 1) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = 0xFF;\n\n s += 1 * 3;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n#endif\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len3 = src_len / 3;\n size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n while (n >= 1) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = 0xFF;\n\n s += 1 * 3;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\n" +
+ "te_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len3 = src_len / 3;\n size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B, //\n +0x00, +0x06, +0x07, +0x08, //\n +0x00, +0x03, +0x04, +0x05, //\n +0x00, +0x00, +0x01, +0x02);\n __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //\n -0x01, +0x00, +0x00, +0x00, //\n -0x01, +0x00, +0x00, +0x00, //\n -0x01, +0x00, +0x00, +0x00);\n\n while (n >= 6) {\n __m128i x;\n x = _mm_loadu_si128((const __m128i*)(const void*)s);\n x = _mm_shuffle_epi8(x, shuffle);\n x = _mm_or_si128(x, or_ff);\n _mm_storeu_" +
+ "si128((__m128i*)(void*)d, x);\n\n s += 4 * 3;\n d += 4 * 4;\n n -= 4;\n }\n\n while (n >= 1) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n d[0] = b2;\n d[1] = b1;\n d[2] = b0;\n d[3] = 0xFF;\n\n s += 1 * 3;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t src_len3 = src_len / 3;\n size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n while (n >= 1) {\n uint8_t b0 = s[0];\n uint8_t b1 = s[1];\n uint8_t b2 = s[2];\n d[0] = b2;\n " +
+ " d[1] = b1;\n d[2] = b0;\n d[3] = 0xFF;\n\n s += 1 * 3;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\n" +
"" +
"// --------\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__xxx__index__src(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n if (dst_palette_len != 1024) {\n return 0;\n }\n size_t dst_len3 = dst_len / 3;\n size_t len = (dst_len3 < src_len) ? dst_len3 : src_len;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n const size_t loop_unroll_count = 4;\n\n // The comparison in the while condition is \">\", not \">=\", because with\n // \">=\", the last 4-byte store could write past the end of the dst slice.\n //\n // Each 4-byte store writes one too many bytes, but a subsequent store\n // will overwrite that with the correct byte. There is always another\n // store, whether a 4-byte store in this loop" +
" or a 1-byte store in the\n // next loop.\n while (n > loop_unroll_count) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[0] * 4)));\n wuffs_base__store_u32le__no_bounds_check(\n d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[1] * 4)));\n wuffs_base__store_u32le__no_bounds_check(\n d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[2] * 4)));\n wuffs_base__store_u32le__no_bounds_check(\n d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[3] * 4)));\n\n s += loop_unroll_count * 1;\n d += loop_unroll_count * 3;\n n -= loop_unroll_count;\n }\n\n while (n >= 1) {\n uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n ((siz" +
@@ -609,8 +610,8 @@
"dst_palette_ptr + ((size_t)s[1] * 4)));\n wuffs_base__store_u32le__no_bounds_check(\n d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[2] * 4)));\n wuffs_base__store_u32le__no_bounds_check(\n d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[3] * 4)));\n\n s += loop_unroll_count * 1;\n d += loop_unroll_count * 4;\n n -= loop_unroll_count;\n }\n\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n dst_palette_ptr + ((size_t)s[0] * 4)));\n\n s += 1 * 1;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n if (dst_palette_len != 1024) {\n return 0;\n" +
" }\n size_t dst_len4 = dst_len / 4;\n size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n const size_t loop_unroll_count = 4;\n\n while (n >= loop_unroll_count) {\n uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n ((size_t)s[0] * 4));\n if (s0) {\n wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n }\n uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n ((size_t)s[1] * 4));\n if (s1) {\n wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);\n }\n uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n ((size_t)s[2] * 4));\n if (s2) {\n wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);\n }\n uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_" +
"palette_ptr +\n ((size_t)s[3] * 4));\n if (s3) {\n wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);\n }\n\n s += loop_unroll_count * 1;\n d += loop_unroll_count * 4;\n n -= loop_unroll_count;\n }\n\n while (n >= 1) {\n uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n ((size_t)s[0] * 4));\n if (s0) {\n wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n }\n\n s += 1 * 1;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n#if defined(__GNUC__)\n__attribute__((target(\"sse4.2\")))\n#endif\nstatic uint64_t //\nwuffs_base__pixel_swizzler__xxxx__y__sse128(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n " +
- " const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03, //\n 0x02, 0x02, 0x02, 0x02, //\n 0x01, 0x01, 0x01, 0x01, //\n 0x00, 0x00, 0x00, 0x00);\n __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00, //\n 0xFF, 0x00, 0x00, 0x00, //\n 0xFF, 0x00, 0x00, 0x00, //\n 0xFF, 0x00, 0x00, 0x00);\n\n while (n >= 4) {\n __m128i x;\n x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));\n x = _mm_shuffle_epi8(x, shuffle);\n x = _mm_or_si128(x, or);\n _mm_storeu_si128((void*)d, x);\n\n s += 4 * 1;\n d += 4 * 4;\n n -= 4;\n }\n\n while (n >= 1) {\n wuffs_base__store_u32le_" +
- "_no_bounds_check(\n d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n s += 1 * 1;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n#endif\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n s += 1 * 1;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\n" +
+ " const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03, //\n +0x02, +0x02, +0x02, +0x02, //\n +0x01, +0x01, +0x01, +0x01, //\n +0x00, +0x00, +0x00, +0x00);\n __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //\n -0x01, +0x00, +0x00, +0x00, //\n -0x01, +0x00, +0x00, +0x00, //\n -0x01, +0x00, +0x00, +0x00);\n\n while (n >= 4) {\n __m128i x;\n x = _mm_cvtsi32_si128((int)(wuffs_base__load_u32le__no_bounds_check(s)));\n x = _mm_shuffle_epi8(x, shuffle);\n x = _mm_or_si128(x, or_ff);\n _mm_storeu_si128((__m128i*)(void*)d, x);\n\n s += 4 * 1;\n d += 4 * 4;\n" +
+ " n -= 4;\n }\n\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n s += 1 * 1;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n const uint8_t* src_ptr,\n size_t src_len) {\n size_t dst_len4 = dst_len / 4;\n size_t len = (dst_len4 < src_len) ? dst_len4 : src_len;\n uint8_t* d = dst_ptr;\n const uint8_t* s = src_ptr;\n size_t n = len;\n\n while (n >= 1) {\n wuffs_base__store_u32le__no_bounds_check(\n d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n s += 1 * 1;\n d += 1 * 4;\n n -= 1;\n }\n\n return len;\n}\n\n" +
"" +
"// --------\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__transparent_black_src(\n uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n uint64_t num_pixels,\n uint32_t dst_pixfmt_bytes_per_pixel) {\n uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n if (n > num_pixels) {\n n = num_pixels;\n }\n memset(dst_ptr, 0, ((size_t)(n * dst_pixfmt_bytes_per_pixel)));\n return n;\n}\n\nstatic uint64_t //\nwuffs_base__pixel_swizzler__transparent_black_src_over(\n uint8_t* dst_ptr,\n size_t dst_len,\n uint8_t* dst_palette_ptr,\n size_t dst_palette_len,\n uint64_t num_pixels,\n uint32_t dst_pixfmt_bytes_per_pixel) {\n uint64_t n = ((uint64_t)dst_len) / dst_pixfmt_bytes_per_pixel;\n if (n > num_pixels) {\n n = num_pixels;\n }\n return n;\n}\n\n" +
"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b1aaa53..77dbca4 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -14545,16 +14545,16 @@
const uint8_t* s = src_ptr;
size_t n = len;
- __m128i shuffle = _mm_set_epi8(0x0F, 0x0C, 0x0D, 0x0E, //
- 0x0B, 0x08, 0x09, 0x0A, //
- 0x07, 0x04, 0x05, 0x06, //
- 0x03, 0x00, 0x01, 0x02);
+ __m128i shuffle = _mm_set_epi8(+0x0F, +0x0C, +0x0D, +0x0E, //
+ +0x0B, +0x08, +0x09, +0x0A, //
+ +0x07, +0x04, +0x05, +0x06, //
+ +0x03, +0x00, +0x01, +0x02);
while (n >= 4) {
__m128i x;
- x = _mm_loadu_si128((const void*)s);
+ x = _mm_loadu_si128((const __m128i*)(const void*)s);
x = _mm_shuffle_epi8(x, shuffle);
- _mm_storeu_si128((void*)d, x);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
s += 4 * 4;
d += 4 * 4;
@@ -14575,7 +14575,7 @@
}
return len;
}
-#endif
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
static uint64_t //
wuffs_base__pixel_swizzler__swap_rgbx_bgrx(uint8_t* dst_ptr,
@@ -15759,21 +15759,21 @@
const uint8_t* s = src_ptr;
size_t n = len;
- __m128i shuffle = _mm_set_epi8(0x00, 0x09, 0x0A, 0x0B, //
- 0x00, 0x06, 0x07, 0x08, //
- 0x00, 0x03, 0x04, 0x05, //
- 0x00, 0x00, 0x01, 0x02);
- __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00);
+ __m128i shuffle = _mm_set_epi8(+0x00, +0x09, +0x0A, +0x0B, //
+ +0x00, +0x06, +0x07, +0x08, //
+ +0x00, +0x03, +0x04, +0x05, //
+ +0x00, +0x00, +0x01, +0x02);
+ __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00);
while (n >= 6) {
__m128i x;
- x = _mm_loadu_si128((const void*)s);
+ x = _mm_loadu_si128((const __m128i*)(const void*)s);
x = _mm_shuffle_epi8(x, shuffle);
- x = _mm_or_si128(x, or);
- _mm_storeu_si128((void*)d, x);
+ x = _mm_or_si128(x, or_ff);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
s += 4 * 3;
d += 4 * 4;
@@ -15796,7 +15796,7 @@
return len;
}
-#endif
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
static uint64_t //
wuffs_base__pixel_swizzler__bgrw__rgb(uint8_t* dst_ptr,
@@ -16136,21 +16136,21 @@
const uint8_t* s = src_ptr;
size_t n = len;
- __m128i shuffle = _mm_set_epi8(0x03, 0x03, 0x03, 0x03, //
- 0x02, 0x02, 0x02, 0x02, //
- 0x01, 0x01, 0x01, 0x01, //
- 0x00, 0x00, 0x00, 0x00);
- __m128i or = _mm_set_epi8(0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00, //
- 0xFF, 0x00, 0x00, 0x00);
+ __m128i shuffle = _mm_set_epi8(+0x03, +0x03, +0x03, +0x03, //
+ +0x02, +0x02, +0x02, +0x02, //
+ +0x01, +0x01, +0x01, +0x01, //
+ +0x00, +0x00, +0x00, +0x00);
+ __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00);
while (n >= 4) {
__m128i x;
- x = _mm_cvtsi32_si128(wuffs_base__load_u32le__no_bounds_check(s));
+ x = _mm_cvtsi32_si128((int)(wuffs_base__load_u32le__no_bounds_check(s)));
x = _mm_shuffle_epi8(x, shuffle);
- x = _mm_or_si128(x, or);
- _mm_storeu_si128((void*)d, x);
+ x = _mm_or_si128(x, or_ff);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
s += 4 * 1;
d += 4 * 4;
@@ -16168,7 +16168,7 @@
return len;
}
-#endif
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
static uint64_t //
wuffs_base__pixel_swizzler__xxxx__y(uint8_t* dst_ptr,