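Unroll some std/png sse128 loops

Benchmark columns below: name, old speed (before this change), new speed
(after this change), delta, then the p-value and sample counts. A "~" delta
means no statistically significant difference.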
wuffs_png_decode_filt_1_dist_3/clang9 1.57GB/s ± 0% 1.57GB/s ± 0% ~ (p=0.151 n=5+5)
wuffs_png_decode_filt_1_dist_4/clang9 4.62GB/s ± 0% 5.87GB/s ± 1% +26.93% (p=0.008 n=5+5)
wuffs_png_decode_filt_2_dist_3/clang9 13.3GB/s ± 0% 13.1GB/s ± 1% -1.52% (p=0.008 n=5+5)
wuffs_png_decode_filt_2_dist_4/clang9 13.3GB/s ± 0% 13.1GB/s ± 1% -1.50% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/clang9 910MB/s ± 0% 909MB/s ± 0% ~ (p=0.095 n=5+5)
wuffs_png_decode_filt_3_dist_4/clang9 1.78GB/s ± 0% 1.84GB/s ± 0% +3.65% (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_3/clang9 542MB/s ± 0% 571MB/s ± 0% +5.31% (p=0.016 n=4+5)
wuffs_png_decode_filt_4_dist_4/clang9 844MB/s ± 0% 819MB/s ± 0% -3.00% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9 127MB/s ± 0% 127MB/s ± 0% ~ (p=0.310 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 326MB/s ± 0% 322MB/s ± 0% -1.19% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 259MB/s ± 0% 256MB/s ± 0% -1.31% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9 128MB/s ± 0% 128MB/s ± 0% ~ (p=0.278 n=5+5)
wuffs_png_decode_filt_1_dist_3/gcc10 1.84GB/s ± 0% 1.84GB/s ± 0% ~ (p=0.190 n=4+5)
wuffs_png_decode_filt_1_dist_4/gcc10 3.38GB/s ± 1% 5.41GB/s ± 1% +59.98% (p=0.008 n=5+5)
wuffs_png_decode_filt_2_dist_3/gcc10 8.29GB/s ± 0% 8.30GB/s ± 0% ~ (p=0.222 n=5+5)
wuffs_png_decode_filt_2_dist_4/gcc10 8.28GB/s ± 0% 8.30GB/s ± 0% +0.24% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/gcc10 1.07GB/s ± 0% 1.08GB/s ± 0% ~ (p=0.421 n=5+5)
wuffs_png_decode_filt_3_dist_4/gcc10 2.13GB/s ± 0% 2.36GB/s ± 0% +10.74% (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_3/gcc10 587MB/s ± 0% 616MB/s ± 0% +4.95% (p=0.016 n=4+5)
wuffs_png_decode_filt_4_dist_4/gcc10 796MB/s ± 0% 871MB/s ± 0% +9.44% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10 130MB/s ± 0% 131MB/s ± 0% +0.78% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 320MB/s ± 0% 325MB/s ± 0% +1.69% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 270MB/s ± 0% 273MB/s ± 0% +1.30% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10 131MB/s ± 0% 131MB/s ± 0% +0.40% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 3d8aff4..4fb5fea 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31102,13 +31102,27 @@
wuffs_base__slice_u8 i_slice_c = a_curr;
v_c.ptr = i_slice_c.ptr;
v_c.len = 4;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
while (v_c.ptr < i_end0_c) {
v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
v_x128 = _mm_add_epi8(v_x128, v_a128);
v_a128 = v_x128;
wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
v_c.ptr += 4;
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_a128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ }
+ v_c.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ while (v_c.ptr < i_end1_c) {
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_a128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
}
v_c.len = 0;
}
@@ -31141,7 +31155,7 @@
wuffs_base__slice_u8 i_slice_c = a_curr;
v_c.ptr = i_slice_c.ptr;
v_c.len = 4;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
while (v_c.ptr < i_end0_c) {
v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
@@ -31149,6 +31163,22 @@
v_a128 = v_x128;
wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
v_c.ptr += 4;
+ v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ }
+ v_c.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ while (v_c.ptr < i_end1_c) {
+ v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
}
v_c.len = 0;
}
@@ -31162,7 +31192,7 @@
i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
v_c.len = 4;
v_p.len = 4;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
while (v_c.ptr < i_end0_c) {
v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
v_p128 = _mm_avg_epu8(v_a128, v_b128);
@@ -31173,6 +31203,29 @@
wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
v_c.ptr += 4;
v_p.ptr += 4;
+ v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_p128 = _mm_avg_epu8(v_a128, v_b128);
+ v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ while (v_c.ptr < i_end1_c) {
+ v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_p128 = _mm_avg_epu8(v_a128, v_b128);
+ v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ v_p.ptr += 4;
}
v_c.len = 0;
v_p.len = 0;
@@ -31214,7 +31267,7 @@
i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
v_c.len = 4;
v_p.len = 4;
- uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);
+ uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 7, 6);
while (v_c.ptr < i_end0_c) {
v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
@@ -31235,11 +31288,54 @@
wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
v_c.ptr += 3;
v_p.ptr += 3;
+ v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+ v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+ v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+ v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+ v_pa128 = _mm_abs_epi16(v_pa128);
+ v_pb128 = _mm_abs_epi16(v_pb128);
+ v_pc128 = _mm_abs_epi16(v_pc128);
+ v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+ v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ v_c128 = v_b128;
+ v_x128 = _mm_packus_epi16(v_x128, v_x128);
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 3;
+ v_p.ptr += 3;
+ }
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);
+ while (v_c.ptr < i_end1_c) {
+ v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+ v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+ v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+ v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+ v_pa128 = _mm_abs_epi16(v_pa128);
+ v_pb128 = _mm_abs_epi16(v_pb128);
+ v_pc128 = _mm_abs_epi16(v_pc128);
+ v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+ v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ v_c128 = v_b128;
+ v_x128 = _mm_packus_epi16(v_x128, v_x128);
+ wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 3;
+ v_p.ptr += 3;
}
v_c.len = 3;
v_p.len = 3;
- uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
- while (v_c.ptr < i_end1_c) {
+ uint8_t* i_end2_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ while (v_c.ptr < i_end2_c) {
v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u24le__no_bounds_check(v_p.ptr)));
v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
v_pa128 = _mm_sub_epi16(v_b128, v_c128);
@@ -31297,7 +31393,7 @@
i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
v_c.len = 4;
v_p.len = 4;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
while (v_c.ptr < i_end0_c) {
v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
@@ -31318,6 +31414,49 @@
wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
v_c.ptr += 4;
v_p.ptr += 4;
+ v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+ v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+ v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+ v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+ v_pa128 = _mm_abs_epi16(v_pa128);
+ v_pb128 = _mm_abs_epi16(v_pb128);
+ v_pc128 = _mm_abs_epi16(v_pc128);
+ v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+ v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ v_c128 = v_b128;
+ v_x128 = _mm_packus_epi16(v_x128, v_x128);
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ v_p.ptr += 4;
+ }
+ v_c.len = 4;
+ v_p.len = 4;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+ while (v_c.ptr < i_end1_c) {
+ v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+ v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+ v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+ v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+ v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+ v_pa128 = _mm_abs_epi16(v_pa128);
+ v_pb128 = _mm_abs_epi16(v_pb128);
+ v_pc128 = _mm_abs_epi16(v_pc128);
+ v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+ v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+ v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+ v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+ v_x128 = _mm_add_epi8(v_x128, v_p128);
+ v_a128 = v_x128;
+ v_c128 = v_b128;
+ v_x128 = _mm_packus_epi16(v_x128, v_x128);
+ wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+ v_c.ptr += 4;
+ v_p.ptr += 4;
}
v_c.len = 0;
v_p.len = 0;
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index 5d447d5..1e911f8 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -26,7 +26,7 @@
// var x128 : base.sse128_i
// var a128 : base.sse128_i
//
-// iterate (c = args.curr)(length: 4, advance: 3, unroll: 1) {
+// iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) {
// x128.load_u32!(a: c.peek_u32le())
// x128 = x128._mm_add_epi8!(b: a128)
// a128 = x128
@@ -45,7 +45,7 @@
var x128 : base.sse128_i
var a128 : base.sse128_i
- iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+ iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
x128.load_u32!(a: c.peek_u32le())
x128 = x128._mm_add_epi8(b: a128)
a128 = x128
@@ -79,7 +79,7 @@
if args.prev.length() == 0 {
k128 = k128.create_mm_set1_epi8(a: 0xFE)
- iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+ iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
// The predictor, p128, is just half (rounded down) of the previous
// pixel, a128. In this branch, b128 stays zero so the average of
// a128 and b128 is just half of a128. _mm_avg_epu8 rounds up, but
@@ -96,7 +96,7 @@
} else {
k128 = k128.create_mm_set1_epi8(a: 0x01)
- iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
// Load the pixel from the row above.
b128.load_u32!(a: p.peek_u32le())
@@ -147,7 +147,7 @@
var z128 : base.sse128_i
// § The advance is 3, not 4.
- iterate (c = args.curr, p = args.prev)(length: 4, advance: 3, unroll: 1) {
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 3, unroll: 2) {
b128.load_u32!(a: p.peek_u32le())
b128 = b128._mm_unpacklo_epi8(b: z128)
pa128 = b128._mm_sub_epi16(b: c128)
@@ -217,7 +217,7 @@
var smallest128 : base.sse128_i
var z128 : base.sse128_i
- iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
+ iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
// Load the pixel from the row above.
b128.load_u32!(a: p.peek_u32le())