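Unroll some std/png fallback loops

The three 3-byte-distance "iterate" loops in std/png/decode_filter_fallback.wuffs
change from "unroll: 1" to "unroll: 2". The generated C now processes 6 bytes per
iteration of the main loop and handles any remaining 3-byte group in a tail loop.

Benchmark results (benchstat columns: name, old speed, new speed, delta):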
wuffs_png_decode_filt_1_dist_3/clang9 1.57GB/s ± 0% 1.84GB/s ± 0% +16.85% (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/clang9 909MB/s ± 0% 1153MB/s ± 0% +26.92% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9 127MB/s ± 0% 129MB/s ± 0% +1.56% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9 128MB/s ± 0% 131MB/s ± 0% +2.05% (p=0.008 n=5+5)
wuffs_png_decode_filt_1_dist_3/gcc10 1.84GB/s ± 0% 1.85GB/s ± 0% ~ (p=0.095 n=5+5)
wuffs_png_decode_filt_3_dist_3/gcc10 1.08GB/s ± 0% 1.14GB/s ± 1% +5.87% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10 131MB/s ± 0% 132MB/s ± 0% +0.46% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10 131MB/s ± 0% 132MB/s ± 0% +0.66% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 4fb5fea..708040f 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30517,7 +30517,7 @@
wuffs_base__slice_u8 i_slice_c = a_curr;
v_c.ptr = i_slice_c.ptr;
v_c.len = 3;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 6) * 6);
while (v_c.ptr < i_end0_c) {
v_fa0 += v_c.ptr[0];
v_c.ptr[0] = v_fa0;
@@ -30526,6 +30526,24 @@
v_fa2 += v_c.ptr[2];
v_c.ptr[2] = v_fa2;
v_c.ptr += 3;
+ v_fa0 += v_c.ptr[0];
+ v_c.ptr[0] = v_fa0;
+ v_fa1 += v_c.ptr[1];
+ v_c.ptr[1] = v_fa1;
+ v_fa2 += v_c.ptr[2];
+ v_c.ptr[2] = v_fa2;
+ v_c.ptr += 3;
+ }
+ v_c.len = 3;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ while (v_c.ptr < i_end1_c) {
+ v_fa0 += v_c.ptr[0];
+ v_c.ptr[0] = v_fa0;
+ v_fa1 += v_c.ptr[1];
+ v_c.ptr[1] = v_fa1;
+ v_fa2 += v_c.ptr[2];
+ v_c.ptr[2] = v_fa2;
+ v_c.ptr += 3;
}
v_c.len = 0;
}
@@ -30652,7 +30670,7 @@
wuffs_base__slice_u8 i_slice_c = a_curr;
v_c.ptr = i_slice_c.ptr;
v_c.len = 3;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 6) * 6);
while (v_c.ptr < i_end0_c) {
v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
v_c.ptr[0] = v_fa0;
@@ -30661,6 +30679,24 @@
v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
v_c.ptr[2] = v_fa2;
v_c.ptr += 3;
+ v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
+ v_c.ptr[0] = v_fa0;
+ v_fa1 = ((v_fa1 / 2) + v_c.ptr[1]);
+ v_c.ptr[1] = v_fa1;
+ v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
+ v_c.ptr[2] = v_fa2;
+ v_c.ptr += 3;
+ }
+ v_c.len = 3;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ while (v_c.ptr < i_end1_c) {
+ v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
+ v_c.ptr[0] = v_fa0;
+ v_fa1 = ((v_fa1 / 2) + v_c.ptr[1]);
+ v_c.ptr[1] = v_fa1;
+ v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
+ v_c.ptr[2] = v_fa2;
+ v_c.ptr += 3;
}
v_c.len = 0;
}
@@ -30673,7 +30709,7 @@
i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
v_c.len = 3;
v_p.len = 3;
- uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 6) * 6);
while (v_c.ptr < i_end0_c) {
v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
v_c.ptr[0] = v_fa0;
@@ -30683,6 +30719,27 @@
v_c.ptr[2] = v_fa2;
v_c.ptr += 3;
v_p.ptr += 3;
+ v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
+ v_c.ptr[0] = v_fa0;
+ v_fa1 = (((uint8_t)(((((uint32_t)(v_fa1)) + ((uint32_t)(v_p.ptr[1]))) / 2))) + v_c.ptr[1]);
+ v_c.ptr[1] = v_fa1;
+ v_fa2 = (((uint8_t)(((((uint32_t)(v_fa2)) + ((uint32_t)(v_p.ptr[2]))) / 2))) + v_c.ptr[2]);
+ v_c.ptr[2] = v_fa2;
+ v_c.ptr += 3;
+ v_p.ptr += 3;
+ }
+ v_c.len = 3;
+ v_p.len = 3;
+ uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+ while (v_c.ptr < i_end1_c) {
+ v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
+ v_c.ptr[0] = v_fa0;
+ v_fa1 = (((uint8_t)(((((uint32_t)(v_fa1)) + ((uint32_t)(v_p.ptr[1]))) / 2))) + v_c.ptr[1]);
+ v_c.ptr[1] = v_fa1;
+ v_fa2 = (((uint8_t)(((((uint32_t)(v_fa2)) + ((uint32_t)(v_p.ptr[2]))) / 2))) + v_c.ptr[2]);
+ v_c.ptr[2] = v_fa2;
+ v_c.ptr += 3;
+ v_p.ptr += 3;
}
v_c.len = 0;
v_p.len = 0;
diff --git a/std/png/decode_filter_fallback.wuffs b/std/png/decode_filter_fallback.wuffs
index ac10233..930a475 100644
--- a/std/png/decode_filter_fallback.wuffs
+++ b/std/png/decode_filter_fallback.wuffs
@@ -47,7 +47,7 @@
var fa1 : base.u8
var fa2 : base.u8
- iterate (c = args.curr)(length: 3, advance: 3, unroll: 1) {
+ iterate (c = args.curr)(length: 3, advance: 3, unroll: 2) {
fa0 ~mod+= c[0]
c[0] = fa0
fa1 ~mod+= c[1]
@@ -163,7 +163,7 @@
var fa2 : base.u8
if args.prev.length() == 0 {
- iterate (c = args.curr)(length: 3, advance: 3, unroll: 1) {
+ iterate (c = args.curr)(length: 3, advance: 3, unroll: 2) {
fa0 = (fa0 / 2) ~mod+ c[0]
c[0] = fa0
fa1 = (fa1 / 2) ~mod+ c[1]
@@ -173,7 +173,7 @@
}
} else {
- iterate (c = args.curr, p = args.prev)(length: 3, advance: 3, unroll: 1) {
+ iterate (c = args.curr, p = args.prev)(length: 3, advance: 3, unroll: 2) {
fa0 = ((((fa0 as base.u32) + (p[0] as base.u32)) / 2) as base.u8) ~mod+ c[0]
c[0] = fa0
fa1 = ((((fa1 as base.u32) + (p[1] as base.u32)) / 2) as base.u8) ~mod+ c[1]