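Have std/png filter_1_distance_? use more SIMD

This commit makes the SSE4.2 filter 1 (Sub) loops for distances 3 and 4
process 16 bytes per iteration. The benchmark numbers below are slower
after the change than before it, so this commit will soon be followed by
a rollback. It is committed anyway so that we can refer to these numbers
in the git log.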
wuffs_png_decode_filt_1_dist_3/clang9 1.82GB/s ± 0% 1.03GB/s ± 0% -43.49% (p=0.008 n=5+5)
wuffs_png_decode_filt_1_dist_4/clang9 6.02GB/s ± 0% 5.23GB/s ± 0% -13.05% (p=0.008 n=5+5)
wuffs_png_decode_filt_1_dist_3/gcc10 1.82GB/s ± 1% 1.03GB/s ± 0% -43.36% (p=0.008 n=5+5)
wuffs_png_decode_filt_1_dist_4/gcc10 5.42GB/s ± 0% 5.13GB/s ± 1% -5.39% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index a167986..48b08bf 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30290,6 +30290,13 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
static wuffs_base__empty_struct
+wuffs_png__decoder__filter_1_distance_3_sse42(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
wuffs_png__decoder__filter_1_distance_4_sse42(
wuffs_png__decoder* self,
wuffs_base__slice_u8 a_curr);
@@ -31142,6 +31149,55 @@
return wuffs_base__make_empty_struct();
}
+// -------- func png.decoder.filter_1_distance_3_sse42
+// SSE4.2 variant of PNG filter 1 ("Sub") for a filter distance of 3: rewrites a_curr in place so that every 3-byte group becomes the byte-wise sum of itself and the (already rewritten) group before it. self is not read.
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_1_distance_3_sse42(
+ wuffs_png__decoder* self,
+ wuffs_base__slice_u8 a_curr) {
+ wuffs_base__slice_u8 v_c = {0};
+ __m128i v_x128 = {0};
+ __m128i v_i128 = {0};
+ __m128i v_j128 = {0};
+ __m128i v_k128 = {0};
+ __m128i v_a128 = {0}; // Previous output pixel in the low 3 bytes; starts as all zeroes.
+
+ {
+ wuffs_base__slice_u8 i_slice_c = a_curr;
+ v_c.ptr = i_slice_c.ptr;
+ v_c.len = 16; // Each main-loop pass loads and stores a full 16 bytes.
+ uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 16, 12); // NOTE(review): presumably bounds the loop so that a 16-byte access always fits while advancing 12 bytes; confirm against wuffs_base__iterate_total_advance.
+ while (v_c.ptr < i_end0_c) {
+ v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr)); // Unaligned load: 4 pixels (12 bytes) plus 4 look-ahead bytes.
+ v_i128 = _mm_add_epi8(v_x128, v_a128); // Fold the previous pixel into pixel 0.
+ v_j128 = _mm_slli_si128(v_i128, (int32_t)(3)); // Shift up by one pixel (3 bytes)...
+ v_j128 = _mm_add_epi8(v_j128, v_i128); // ...and add: each pixel plus its predecessor.
+ v_k128 = _mm_slli_si128(v_j128, (int32_t)(6)); // Shift up by two pixels (6 bytes)...
+ v_k128 = _mm_add_epi8(v_k128, v_j128); // ...and add: running sum over all 4 pixels.
+ v_k128 = _mm_blend_epi16(v_k128, v_x128, (int32_t)(192)); // Mask 192 (0xC0) takes 16-bit lanes 6..7 (bytes 12..15) from v_x128, so the look-ahead bytes are stored back unchanged.
+ _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
+ v_a128 = _mm_srli_si128(_mm_slli_si128(v_k128, (int32_t)(4)), (int32_t)(13)); // Moves bytes 9..11 (the last finished pixel) down to bytes 0..2 and zeroes everything else.
+ v_c.ptr += 12; // Only 12 of the 16 bytes were finished this pass.
+ }
+ v_c.len = 3; // Tail: one 3-byte pixel at a time.
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3); // Covers the remaining whole 3-byte groups only.
+ while (v_c.ptr < i_end1_c) {
+ v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_a128); // Byte-wise (wrapping) add of the previous pixel.
+ v_a128 = v_x128; // Carry this pixel into the next iteration.
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 3;
+ }
+ v_c.len = 0;
+ }
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
// -------- func png.decoder.filter_1_distance_4_sse42
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
@@ -31154,24 +31210,26 @@
wuffs_base__slice_u8 a_curr) {
wuffs_base__slice_u8 v_c = {0};
__m128i v_x128 = {0};
+ __m128i v_i128 = {0};
+ __m128i v_j128 = {0};
+ __m128i v_k128 = {0};
__m128i v_a128 = {0};
{
wuffs_base__slice_u8 i_slice_c = a_curr;
v_c.ptr = i_slice_c.ptr;
- v_c.len = 4;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
+ v_c.len = 16;
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 16) * 16);
while (v_c.ptr < i_end0_c) {
- v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
- v_x128 = _mm_add_epi8(v_x128, v_a128);
- v_a128 = v_x128;
- wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
- v_c.ptr += 4;
- v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
- v_x128 = _mm_add_epi8(v_x128, v_a128);
- v_a128 = v_x128;
- wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
- v_c.ptr += 4;
+ v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));
+ v_i128 = _mm_add_epi8(v_x128, v_a128);
+ v_j128 = _mm_slli_si128(v_i128, (int32_t)(4));
+ v_j128 = _mm_add_epi8(v_j128, v_i128);
+ v_k128 = _mm_slli_si128(v_j128, (int32_t)(8));
+ v_k128 = _mm_add_epi8(v_k128, v_j128);
+ _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
+ v_a128 = _mm_srli_si128(v_k128, (int32_t)(12));
+ v_c.ptr += 16;
}
v_c.len = 4;
uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
@@ -32134,6 +32192,9 @@
wuffs_png__decoder* self) {
if (self->private_impl.f_filter_distance == 3) {
self->private_impl.choosy_filter_1 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_1_distance_3_sse42 :
+#endif
&wuffs_png__decoder__filter_1_distance_3_fallback);
self->private_impl.choosy_filter_3 = (
&wuffs_png__decoder__filter_3_distance_3_fallback);
diff --git a/std/png/decode_filter_sse42.wuffs b/std/png/decode_filter_sse42.wuffs
index e96a304..d63b562 100644
--- a/std/png/decode_filter_sse42.wuffs
+++ b/std/png/decode_filter_sse42.wuffs
@@ -16,36 +16,95 @@
// Filter 1: Sub.
-// This (filter = 1, distance = 3) implementation doesn't actually bench faster
-// than the non-SIMD one.
-//
-// pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
-// choose cpu_arch >= x86_sse42,
-// {
-// var c : slice base.u8
-// var x128 : base.x86_m128i
-// var a128 : base.x86_m128i
-//
-// iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) {
-// x128.load_u32!(a: c.peek_u32le())
-// x128 = x128._mm_add_epi8(b: a128)
-// a128 = x128
-// c.poke_u24le!(a: x128.truncate_u32())
-// } else (length: 3, advance: 3, unroll: 1) {
-// x128.load_u32!(a: c.peek_u24le_as_u32())
-// x128 = x128._mm_add_epi8(b: a128)
-// c.poke_u24le!(a: x128.truncate_u32())
-// }
-// }
+pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
+ choose cpu_arch >= x86_sse42,
+{
+ var c : slice base.u8
+ var x128 : base.x86_m128i
+ var i128 : base.x86_m128i
+ var j128 : base.x86_m128i
+ var k128 : base.x86_m128i
+ var a128 : base.x86_m128i
+ // Undoes the Sub filter for 3-byte pixels, in place: each pixel becomes the byte-wise sum of itself and the previous output pixel, which a128 carries in its low 3 bytes.
+ iterate (c = args.curr)(length: 16, advance: 12, unroll: 1) {
+ // For distance_3, we only use the first 12 bytes (96 bits) of the
+ // 128-bit registers. The final 4 bytes (32 bits) are usually junk:
+ // - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; junkC junkD junkE junkF]
+ // For x128, the final bytes are the upcoming pixels:
+ // - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; R4 G4 B4 R5]
+ // For a128, the final bytes are zeroes.
+
+ // x128 = [x0, x1, x2, x3; upcoming]
+ x128.load_slice128!(a: c)
+
+ // i128 = [a+x0, x1, x2, x3; junk]
+ i128 = x128._mm_add_epi8(b: a128)
+
+ // j128 = [0, a+x0, x1, x2; junk]
+ j128 = i128._mm_slli_si128(imm8: 3)
+
+ // j128 = [a+x0, a+x0+x1, x1+x2, x2+x3; junk]
+ j128 = j128._mm_add_epi8(b: i128)
+
+ // k128 = [0, 0, a+x0, a+x0+x1; junk]
+ k128 = j128._mm_slli_si128(imm8: 6)
+
+ // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; junk]
+ k128 = k128._mm_add_epi8(b: j128)
+
+ // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; upcoming]
+ k128 = k128._mm_blend_epi16(b: x128, imm8: 0xC0) // 0xC0 takes 16-bit lanes 6..7 (bytes 12..15) from x128, so the store below leaves the upcoming bytes as they were.
+
+ // Store.
+ k128.store_slice128!(a: c)
+
+ // a128 = [a+x0+x1+x2+x3, 0, 0, 0; zeroes]
+ a128 = k128._mm_slli_si128(imm8: 4)._mm_srli_si128(imm8: 13) // Left 4 then right 13 bytes: bytes 9..11 (the last finished pixel) land in bytes 0..2, all else is zero.
+
+ } else (length: 3, advance: 3, unroll: 1) { // Tail: one pixel at a time; a128 is carried because more than one pixel may remain here (presumably up to five; confirm against the iterate semantics).
+ x128.load_u32!(a: c.peek_u24le_as_u32())
+ x128 = x128._mm_add_epi8(b: a128)
+ a128 = x128
+ c.poke_u24le!(a: x128.truncate_u32())
+ }
+}
pri func decoder.filter_1_distance_4_sse42!(curr: slice base.u8),
choose cpu_arch >= x86_sse42,
{
var c : slice base.u8
var x128 : base.x86_m128i
+ var i128 : base.x86_m128i
+ var j128 : base.x86_m128i
+ var k128 : base.x86_m128i
var a128 : base.x86_m128i
- iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
+ iterate (c = args.curr)(length: 16, advance: 16, unroll: 1) {
+ // x128 = [x0, x1, x2, x3]
+ x128.load_slice128!(a: c)
+
+ // i128 = [a+x0, x1, x2, x3]
+ i128 = x128._mm_add_epi8(b: a128)
+
+ // j128 = [0, a+x0, x1, x2]
+ j128 = i128._mm_slli_si128(imm8: 4)
+
+ // j128 = [a+x0, a+x0+x1, x1+x2, x2+x3]
+ j128 = j128._mm_add_epi8(b: i128)
+
+ // k128 = [0, 0, a+x0, a+x0+x1]
+ k128 = j128._mm_slli_si128(imm8: 8)
+
+ // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3]
+ k128 = k128._mm_add_epi8(b: j128)
+
+ // Store.
+ k128.store_slice128!(a: c)
+
+ // a128 = [a+x0+x1+x2+x3, 0, 0, 0]
+ a128 = k128._mm_srli_si128(imm8: 12)
+
+ } else (length: 4, advance: 4, unroll: 1) {
x128.load_u32!(a: c.peek_u32le())
x128 = x128._mm_add_epi8(b: a128)
a128 = x128
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index d2f0cb2..e261b0c 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -310,7 +310,9 @@
// Filter 0 is a no-op. Filter 2, the up filter, should already vectorize
// easily by a good optimizing C compiler.
if this.filter_distance == 3 {
- choose filter_1 = [filter_1_distance_3_fallback]
+ choose filter_1 = [
+ filter_1_distance_3_sse42,
+ filter_1_distance_3_fallback]
choose filter_3 = [filter_3_distance_3_fallback]
choose filter_4 = [
filter_4_distance_3_sse42,