pixconv: sse42-optimize swizzling RGB to RGBA
PNG (as a source pixel format) is RGB or RGBA order. Wuffs' std/png
benchmarks, like the other image formats, decode to BGRA order. *If*
those benchmarks are altered to target RGBA instead, we get:
name old speed new speed delta
wuffs_png_decode_image_40k_24bpp/clang14 279MB/s ± 0% 293MB/s ± 0% +5.11% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang14 289MB/s ± 0% 303MB/s ± 0% +4.89% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc12 311MB/s ± 0% 332MB/s ± 0% +6.84% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc12 325MB/s ± 0% 339MB/s ± 0% +4.49% (p=0.008 n=5+5)
When starting from 3-channel RGB, ending with RGBA is now as fast as
ending with BGRA.
Updates #141
diff --git a/internal/cgen/base/pixconv-submodule-regular.c b/internal/cgen/base/pixconv-submodule-regular.c
index 42f58d3..9832cd2 100644
--- a/internal/cgen/base/pixconv-submodule-regular.c
+++ b/internal/cgen/base/pixconv-submodule-regular.c
@@ -13,6 +13,14 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
+wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
+ size_t dst_len,
+ uint8_t* dst_palette_ptr,
+ size_t dst_palette_len,
+ const uint8_t* src_ptr,
+ size_t src_len);
+
+static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
uint8_t* dst_palette_ptr,
@@ -3310,6 +3318,59 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
+wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
+ size_t dst_len,
+ uint8_t* dst_palette_ptr,
+ size_t dst_palette_len,
+ const uint8_t* src_ptr,
+ size_t src_len) {
+ size_t dst_len4 = dst_len / 4;
+ size_t src_len3 = src_len / 3;
+ size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;
+ uint8_t* d = dst_ptr;
+ const uint8_t* s = src_ptr;
+ size_t n = len;
+
+ __m128i shuffle = _mm_set_epi8(+0x00, +0x0B, +0x0A, +0x09, //
+ +0x00, +0x08, +0x07, +0x06, //
+ +0x00, +0x05, +0x04, +0x03, //
+ +0x00, +0x02, +0x01, +0x00);
+ __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00);
+
+ while (n >= 6) {
+ __m128i x;
+ x = _mm_lddqu_si128((const __m128i*)(const void*)s);
+ x = _mm_shuffle_epi8(x, shuffle);
+ x = _mm_or_si128(x, or_ff);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
+
+ s += 4 * 3;
+ d += 4 * 4;
+ n -= 4;
+ }
+
+ while (n >= 1) {
+ uint8_t b0 = s[0];
+ uint8_t b1 = s[1];
+ uint8_t b2 = s[2];
+ d[0] = b0;
+ d[1] = b1;
+ d[2] = b2;
+ d[3] = 0xFF;
+
+ s += 1 * 3;
+ d += 1 * 4;
+ n -= 1;
+ }
+
+ return len;
+}
+
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
+static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
uint8_t* dst_palette_ptr,
@@ -4793,6 +4854,11 @@
case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
+ if (wuffs_base__cpu_arch__have_x86_sse42()) {
+ return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
+ }
+#endif
return wuffs_private_impl__swizzle_bgrw__bgr;
case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
@@ -5167,6 +5233,11 @@
case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
+ if (wuffs_base__cpu_arch__have_x86_sse42()) {
+ return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
+ }
+#endif
return wuffs_private_impl__swizzle_bgrw__bgr;
}
return NULL;
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 153086b..cac12c9 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -20954,6 +20954,14 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
+wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
+ size_t dst_len,
+ uint8_t* dst_palette_ptr,
+ size_t dst_palette_len,
+ const uint8_t* src_ptr,
+ size_t src_len);
+
+static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
uint8_t* dst_palette_ptr,
@@ -24251,6 +24259,59 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
static uint64_t //
+wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42(uint8_t* dst_ptr,
+ size_t dst_len,
+ uint8_t* dst_palette_ptr,
+ size_t dst_palette_len,
+ const uint8_t* src_ptr,
+ size_t src_len) {
+ size_t dst_len4 = dst_len / 4;
+ size_t src_len3 = src_len / 3;
+ size_t len = (dst_len4 < src_len3) ? dst_len4 : src_len3;
+ uint8_t* d = dst_ptr;
+ const uint8_t* s = src_ptr;
+ size_t n = len;
+
+ __m128i shuffle = _mm_set_epi8(+0x00, +0x0B, +0x0A, +0x09, //
+ +0x00, +0x08, +0x07, +0x06, //
+ +0x00, +0x05, +0x04, +0x03, //
+ +0x00, +0x02, +0x01, +0x00);
+ __m128i or_ff = _mm_set_epi8(-0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00, //
+ -0x01, +0x00, +0x00, +0x00);
+
+ while (n >= 6) {
+ __m128i x;
+ x = _mm_lddqu_si128((const __m128i*)(const void*)s);
+ x = _mm_shuffle_epi8(x, shuffle);
+ x = _mm_or_si128(x, or_ff);
+ _mm_storeu_si128((__m128i*)(void*)d, x);
+
+ s += 4 * 3;
+ d += 4 * 4;
+ n -= 4;
+ }
+
+ while (n >= 1) {
+ uint8_t b0 = s[0];
+ uint8_t b1 = s[1];
+ uint8_t b2 = s[2];
+ d[0] = b0;
+ d[1] = b1;
+ d[2] = b2;
+ d[3] = 0xFF;
+
+ s += 1 * 3;
+ d += 1 * 4;
+ n -= 1;
+ }
+
+ return len;
+}
+
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
+static uint64_t //
wuffs_private_impl__swizzle_bgrw__rgb__x86_sse42(uint8_t* dst_ptr,
size_t dst_len,
uint8_t* dst_palette_ptr,
@@ -25734,6 +25795,11 @@
case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
+ if (wuffs_base__cpu_arch__have_x86_sse42()) {
+ return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
+ }
+#endif
return wuffs_private_impl__swizzle_bgrw__bgr;
case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
@@ -26108,6 +26174,11 @@
case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY)
+ if (wuffs_base__cpu_arch__have_x86_sse42()) {
+ return wuffs_private_impl__swizzle_bgrw__bgr__x86_sse42;
+ }
+#endif
return wuffs_private_impl__swizzle_bgrw__bgr;
}
return NULL;