Roll back 'png filter_1_distance_? use more SIMD'
See the previous commit for the rationale.
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 48b08bf..a167986 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30290,13 +30290,6 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
static wuffs_base__empty_struct
-wuffs_png__decoder__filter_1_distance_3_sse42(
- wuffs_png__decoder* self,
- wuffs_base__slice_u8 a_curr);
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-static wuffs_base__empty_struct
wuffs_png__decoder__filter_1_distance_4_sse42(
wuffs_png__decoder* self,
wuffs_base__slice_u8 a_curr);
@@ -31149,55 +31142,6 @@
return wuffs_base__make_empty_struct();
}
-// -------- func png.decoder.filter_1_distance_3_sse42
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
-static wuffs_base__empty_struct
-wuffs_png__decoder__filter_1_distance_3_sse42(
- wuffs_png__decoder* self,
- wuffs_base__slice_u8 a_curr) {
- wuffs_base__slice_u8 v_c = {0};
- __m128i v_x128 = {0};
- __m128i v_i128 = {0};
- __m128i v_j128 = {0};
- __m128i v_k128 = {0};
- __m128i v_a128 = {0};
-
- {
- wuffs_base__slice_u8 i_slice_c = a_curr;
- v_c.ptr = i_slice_c.ptr;
- v_c.len = 16;
- uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 16, 12);
- while (v_c.ptr < i_end0_c) {
- v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));
- v_i128 = _mm_add_epi8(v_x128, v_a128);
- v_j128 = _mm_slli_si128(v_i128, (int32_t)(3));
- v_j128 = _mm_add_epi8(v_j128, v_i128);
- v_k128 = _mm_slli_si128(v_j128, (int32_t)(6));
- v_k128 = _mm_add_epi8(v_k128, v_j128);
- v_k128 = _mm_blend_epi16(v_k128, v_x128, (int32_t)(192));
- _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
- v_a128 = _mm_srli_si128(_mm_slli_si128(v_k128, (int32_t)(4)), (int32_t)(13));
- v_c.ptr += 12;
- }
- v_c.len = 3;
- uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
- while (v_c.ptr < i_end1_c) {
- v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
- v_x128 = _mm_add_epi8(v_x128, v_a128);
- v_a128 = v_x128;
- wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
- v_c.ptr += 3;
- }
- v_c.len = 0;
- }
- return wuffs_base__make_empty_struct();
-}
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
// -------- func png.decoder.filter_1_distance_4_sse42
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
@@ -31210,26 +31154,24 @@
wuffs_base__slice_u8 a_curr) {
wuffs_base__slice_u8 v_c = {0};
__m128i v_x128 = {0};
- __m128i v_i128 = {0};
- __m128i v_j128 = {0};
- __m128i v_k128 = {0};
__m128i v_a128 = {0};
{
wuffs_base__slice_u8 i_slice_c = a_curr;
v_c.ptr = i_slice_c.ptr;
- v_c.len = 16;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 16) * 16);
+ v_c.len = 4;
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
while (v_c.ptr < i_end0_c) {
- v_x128 = _mm_lddqu_si128((const __m128i*)(const void*)(v_c.ptr));
- v_i128 = _mm_add_epi8(v_x128, v_a128);
- v_j128 = _mm_slli_si128(v_i128, (int32_t)(4));
- v_j128 = _mm_add_epi8(v_j128, v_i128);
- v_k128 = _mm_slli_si128(v_j128, (int32_t)(8));
- v_k128 = _mm_add_epi8(v_k128, v_j128);
- _mm_storeu_si128((__m128i*)(void*)(v_c.ptr), v_k128);
- v_a128 = _mm_srli_si128(v_k128, (int32_t)(12));
- v_c.ptr += 16;
+ v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_a128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_a128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
}
v_c.len = 4;
uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
@@ -32192,9 +32134,6 @@
wuffs_png__decoder* self) {
if (self->private_impl.f_filter_distance == 3) {
self->private_impl.choosy_filter_1 = (
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
- wuffs_base__cpu_arch__have_sse42() ? &wuffs_png__decoder__filter_1_distance_3_sse42 :
-#endif
&wuffs_png__decoder__filter_1_distance_3_fallback);
self->private_impl.choosy_filter_3 = (
&wuffs_png__decoder__filter_3_distance_3_fallback);
diff --git a/std/png/decode_filter_sse42.wuffs b/std/png/decode_filter_sse42.wuffs
index d63b562..e96a304 100644
--- a/std/png/decode_filter_sse42.wuffs
+++ b/std/png/decode_filter_sse42.wuffs
@@ -16,95 +16,36 @@
// Filter 1: Sub.
-pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
- choose cpu_arch >= x86_sse42,
-{
- var c : slice base.u8
- var x128 : base.x86_m128i
- var i128 : base.x86_m128i
- var j128 : base.x86_m128i
- var k128 : base.x86_m128i
- var a128 : base.x86_m128i
-
- iterate (c = args.curr)(length: 16, advance: 12, unroll: 1) {
- // For distance_3, we only use the first 12 bytes (96 bits) of the
- // 128-bit registers. The final 4 bytes (32 bits) are usually junk:
- // - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; junkC junkD junkE junkF]
- // For x128, the final bytes are the upcoming pixels:
- // - [R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ; R4 G4 B4 R5]
- // For a128, the final bytes are zeroes.
-
- // x128 = [x0, x1, x2, x3; upcoming]
- x128.load_slice128!(a: c)
-
- // i128 = [a+x0, x1, x2, x3; junk]
- i128 = x128._mm_add_epi8(b: a128)
-
- // j128 = [0, a+x0, x1, x2; junk]
- j128 = i128._mm_slli_si128(imm8: 3)
-
- // j128 = [a+x0, a+x0+x1, x1+x2, x2+x3; junk]
- j128 = j128._mm_add_epi8(b: i128)
-
- // k128 = [0, 0, a+x0, a+x0+x1; junk]
- k128 = j128._mm_slli_si128(imm8: 6)
-
- // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; junk]
- k128 = k128._mm_add_epi8(b: j128)
-
- // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3; upcoming]
- k128 = k128._mm_blend_epi16(b: x128, imm8: 0xC0)
-
- // Store.
- k128.store_slice128!(a: c)
-
- // a128 = [a+x0+x1+x2+x3, 0, 0, 0; zeroes]
- a128 = k128._mm_slli_si128(imm8: 4)._mm_srli_si128(imm8: 13)
-
- } else (length: 3, advance: 3, unroll: 1) {
- x128.load_u32!(a: c.peek_u24le_as_u32())
- x128 = x128._mm_add_epi8(b: a128)
- a128 = x128
- c.poke_u24le!(a: x128.truncate_u32())
- }
-}
+// This (filter = 1, distance = 3) SSE4.2 implementation does not actually
+// benchmark faster than the non-SIMD one, so it is left commented out.
+//
+// pri func decoder.filter_1_distance_3_sse42!(curr: slice base.u8),
+// choose cpu_arch >= x86_sse42,
+// {
+// var c : slice base.u8
+// var x128 : base.x86_m128i
+// var a128 : base.x86_m128i
+//
+// iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) {
+// x128.load_u32!(a: c.peek_u32le())
+// x128 = x128._mm_add_epi8(b: a128)
+// a128 = x128
+// c.poke_u24le!(a: x128.truncate_u32())
+// } else (length: 3, advance: 3, unroll: 1) {
+// x128.load_u32!(a: c.peek_u24le_as_u32())
+// x128 = x128._mm_add_epi8(b: a128)
+// c.poke_u24le!(a: x128.truncate_u32())
+// }
+// }
pri func decoder.filter_1_distance_4_sse42!(curr: slice base.u8),
choose cpu_arch >= x86_sse42,
{
var c : slice base.u8
var x128 : base.x86_m128i
- var i128 : base.x86_m128i
- var j128 : base.x86_m128i
- var k128 : base.x86_m128i
var a128 : base.x86_m128i
- iterate (c = args.curr)(length: 16, advance: 16, unroll: 1) {
- // x128 = [x0, x1, x2, x3]
- x128.load_slice128!(a: c)
-
- // i128 = [a+x0, x1, x2, x3]
- i128 = x128._mm_add_epi8(b: a128)
-
- // j128 = [0, a+x0, x1, x2]
- j128 = i128._mm_slli_si128(imm8: 4)
-
- // j128 = [a+x0, a+x0+x1, x1+x2, x2+x3]
- j128 = j128._mm_add_epi8(b: i128)
-
- // k128 = [0, 0, a+x0, a+x0+x1]
- k128 = j128._mm_slli_si128(imm8: 8)
-
- // k128 = [a+x0, a+x0+x1, a+x0+x1+x2, a+x0+x1+x2+x3]
- k128 = k128._mm_add_epi8(b: j128)
-
- // Store.
- k128.store_slice128!(a: c)
-
- // a128 = [a+x0+x1+x2+x3, 0, 0, 0]
- a128 = k128._mm_srli_si128(imm8: 12)
-
- } else (length: 4, advance: 4, unroll: 1) {
+ iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
x128.load_u32!(a: c.peek_u32le())
x128 = x128._mm_add_epi8(b: a128)
a128 = x128
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index e261b0c..d2f0cb2 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -310,9 +310,7 @@
// Filter 0 is a no-op. Filter 2, the up filter, should already vectorize
// easily by a good optimizing C compiler.
if this.filter_distance == 3 {
- choose filter_1 = [
- filter_1_distance_3_sse42,
- filter_1_distance_3_fallback]
+ choose filter_1 = [filter_1_distance_3_fallback]
choose filter_3 = [filter_3_distance_3_fallback]
choose filter_4 = [
filter_4_distance_3_sse42,