Roll back 'png filter_1_distance_? use more SIMD'. See the previous commit for the rationale.
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c index 48b08bf..a167986 100644 --- a/release/c/wuffs-unsupported-snapshot.c +++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30290,13 +30290,6 @@ #if defined(WUFFS_BASE__CPU_ARCH__X86_64) static wuffs_base__empty_struct -wuffs_png__decoder__filter_1_distance_3_sse42( - wuffs_png__decoder* self, - wuffs_base__slice_u8 a_curr); -#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64) - -#if defined(WUFFS_BASE__CPU_ARCH__X86_64) -static wuffs_base__empty_struct wuffs_png__decoder__filter_1_distance_4_sse42( wuffs_png__decoder* self, wuffs_base__slice_u8 a_curr); @@ -31149,55 +31142,6 @@ return wuffs_base__make_empty_struct(); } -// -------- func png.decoder.filter_1_distance_3_sse42 - -#if defined(WUFFS_BASE__CPU_ARCH__X86_64) -#if defined(__GNUC__) -__attribute__((target("sse4.2"))) -#endif -static wuffs_base__empty_struct -wuffs_png__decoder__filter_1_distance_3_sse42( - wuffs_png__decoder* self, - wuffs_base__slice_u8 a_curr) { - wuffs_base__slice_u8 v_c = {0}; - __m128i v_x128 = {0}; - __m128i v_i128 = {0}; - __m128i v_j128 = {0}; - __m128i v_k128 = {0}; - __m128i v_a128 = {0}; - - { - wuffs_base__slice_u8 i_slice_c = a_curr; - v_c.ptr = i_slice_c.ptr; - v_c.len = 16; - uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 16, 12); - while (v_c.ptr < i_end0_c) { - v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr)); - v_i128 = _mm_add_epi8(v_x128, v_a128); - v_j128 = _mm_slli_si128(v_i128, (int32_t)(3)); - v_j128 = _mm_add_epi8(v_j128, v_i128); - v_k128 = _mm_slli_si128(v_j128, (int32_t)(6)); - v_k128 = _mm_add_epi8(v_k128, v_j128); - v_k128 = _mm_blend_epi16(v_k128, v_x128, (int32_t)(192)); - _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128); - v_a128 = _mm_srli_si128(_mm_slli_si128(v_k128, (int32_t)(4)), (int32_t)(13)); - v_c.ptr += 12; - } - v_c.len = 3; - uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3); - while (v_c.ptr < i_end1_c) { - v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr))); - v_x128 = _mm_add_epi8(v_x128, v_a128); 
- v_a128 = v_x128; - wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))); - v_c.ptr += 3; - } - v_c.len = 0; - } - return wuffs_base__make_empty_struct(); -} -#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64) - // -------- func png.decoder.filter_1_distance_4_sse42 #if defined(WUFFS_BASE__CPU_ARCH__X86_64) @@ -31210,26 +31154,24 @@ wuffs_base__slice_u8 a_curr) { wuffs_base__slice_u8 v_c = {0}; __m128i v_x128 = {0}; - __m128i v_i128 = {0}; - __m128i v_j128 = {0}; - __m128i v_k128 = {0}; __m128i v_a128 = {0}; { wuffs_base__slice_u8 i_slice_c = a_curr; v_c.ptr = i_slice_c.ptr; - v_c.len = 16; - uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 16) * 16); + v_c.len = 4; + uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8); while (v_c.ptr < i_end0_c) { - v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr)); - v_i128 = _mm_add_epi8(v_x128, v_a128); - v_j128 = _mm_slli_si128(v_i128, (int32_t)(4)); - v_j128 = _mm_add_epi8(v_j128, v_i128); - v_k128 = _mm_slli_si128(v_j128, (int32_t)(8)); - v_k128 = _mm_add_epi8(v_k128, v_j128); - _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128); - v_a128 = _mm_srli_si128(v_k128, (int32_t)(12)); - v_c.ptr += 16; + v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr))); + v_x128 = _mm_add_epi8(v_x128, v_a128); + v_a128 = v_x128; + wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))); + v_c.ptr += 4; + v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr))); + v_x128 = _mm_add_epi8(v_x128, v_a128); + v_a128 = v_x128; + wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128)))); + v_c.ptr += 4; } v_c.len = 4; uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4); @@ -32192,9 +32134,6 @@ wuffs_png__decoder* self) { if 
(self->private_impl.f_filter_distance == 3) { self->private_impl.choosy_filter_1 = ( -#if defined(WUFFS_BASE__CPU_ARCH__X86_64) - wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_1_distance_3_sse42 : -#endif &wuffs_png__decoder__filter_1_distance_3_fallback); self->private_impl.choosy_filter_3 = ( &wuffs_png__decoder__filter_3_distance_3_fallback);
diff --git a/std/png/decode_filter_sse42.wuffs b/std/png/decode_filter_sse42.wuffs index d63b562..e96a304 100644 --- a/std/png/decode_filter_sse42.wuffs +++ b/std/png/decode_filter_sse42.wuffs
@@ -16,95 +16,36 @@ // Filter 1: Sub. -pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8), - choose cpu_arch >= x86_sse42, -{ - var c : slice base.u8 - var x128 : base.x86_m128i - var i128 : base.x86_m128i - var j128 : base.x86_m128i - var k128 : base.x86_m128i - var a128 : base.x86_m128i - - iterate (c = args.curr)(length: 16, advance: 12, unroll: 1) { - // For distance_3, we only use the first 12 bytes (96 bits) of the - // 128-bit registers. The final 4 bytes (32 bits) are usually junk: - // - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; junkC junkD junkE junkF] - // For x128, the final bytes are the upcoming pixels: - // - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; R4 G4 B4 R5] - // For a128, the final bytes are zeroes. - - // x128 = [x0, x1, x2, x3; upcoming] - x128.load_slice128!(a: c) - - // i128 = [a+x0, x1, x2, x3; junk] - i128 = x128._mm_add_epi8(b: a128) - - // j128 = [0, a+x0, x1, x2; junk] - j128 = i128._mm_slli_si128(imm8: 3) - - // j128 = [a+x0, a+x0+x1, x1+x2, x2+x3; junk] - j128 = j128._mm_add_epi8(b: i128) - - // k128 = [0, 0, a+x0, a+x0+x1; junk] - k128 = j128._mm_slli_si128(imm8: 6) - - // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; junk] - k128 = k128._mm_add_epi8(b: j128) - - // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; upcoming] - k128 = k128._mm_blend_epi16(b: x128, imm8: 0xC0) - - // Store. - k128.store_slice128!(a: c) - - // a128 = [a+x0+x1+x2+x3, 0, 0, 0; zeroes] - a128 = k128._mm_slli_si128(imm8: 4)._mm_srli_si128(imm8: 13) - - } else (length: 3, advance: 3, unroll: 1) { - x128.load_u32!(a: c.peek_u24le_as_u32()) - x128 = x128._mm_add_epi8(b: a128) - a128 = x128 - c.poke_u24le!(a: x128.truncate_u32()) - } -} +// This (filter = 1, distance = 3) implementation doesn't actually bench faster +// than the non-SIMD one. 
+// +// pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8), +// choose cpu_arch >= x86_sse42, +// { +// var c : slice base.u8 +// var x128 : base.x86_m128i +// var a128 : base.x86_m128i +// +// iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) { +// x128.load_u32!(a: c.peek_u32le()) +// x128 = x128._mm_add_epi8(b: a128) +// a128 = x128 +// c.poke_u24le!(a: x128.truncate_u32()) +// } else (length: 3, advance: 3, unroll: 1) { +// x128.load_u32!(a: c.peek_u24le_as_u32()) +// x128 = x128._mm_add_epi8(b: a128) +// c.poke_u24le!(a: x128.truncate_u32()) +// } +// } pri func decoder.filter_1_distance_4_sse42!(curr: slice base.u8), choose cpu_arch >= x86_sse42, { var c : slice base.u8 var x128 : base.x86_m128i - var i128 : base.x86_m128i - var j128 : base.x86_m128i - var k128 : base.x86_m128i var a128 : base.x86_m128i - iterate (c = args.curr)(length: 16, advance: 16, unroll: 1) { - // x128 = [x0, x1, x2, x3] - x128.load_slice128!(a: c) - - // i128 = [a+x0, x1, x2, x3] - i128 = x128._mm_add_epi8(b: a128) - - // j128 = [0, a+x0, x1, x2] - j128 = i128._mm_slli_si128(imm8: 4) - - // j128 = [a+x0, a+x0+x1, x1+x2, x2+x3] - j128 = j128._mm_add_epi8(b: i128) - - // k128 = [0, 0, a+x0, a+x0+x1] - k128 = j128._mm_slli_si128(imm8: 8) - - // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3] - k128 = k128._mm_add_epi8(b: j128) - - // Store. - k128.store_slice128!(a: c) - - // a128 = [a+x0+x1+x2+x3, 0, 0, 0] - a128 = k128._mm_srli_si128(imm8: 12) - - } else (length: 4, advance: 4, unroll: 1) { + iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) { x128.load_u32!(a: c.peek_u32le()) x128 = x128._mm_add_epi8(b: a128) a128 = x128
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs index e261b0c..d2f0cb2 100644 --- a/std/png/decode_png.wuffs +++ b/std/png/decode_png.wuffs
@@ -310,9 +310,7 @@ // Filter 0 is a no-op. Filter 2, the up filter, should already vectorize // easily by a good optimizing C compiler. if this.filter_distance == 3 { - choose filter_1 = [ - filter_1_distance_3_sse42, - filter_1_distance_3_fallback] + choose filter_1 = [filter_1_distance_3_fallback] choose filter_3 = [filter_3_distance_3_fallback] choose filter_4 = [ filter_4_distance_3_sse42,