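Unroll some std/png fallback loops

Raise the unroll count from 1 to 2 for the three iterate loops that use
(length: 3, advance: 3) in std/png/decode_filter_fallback.wuffs, with the
matching changes to release/c/wuffs-unsupported-snapshot.c.

Benchmark columns below: name, old speed, new speed, delta.
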
wuffs_png_decode_filt_1_dist_3/clang9                     1.57GB/s ± 0%   1.84GB/s ± 0%  +16.85%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/clang9                      909MB/s ± 0%   1153MB/s ± 0%  +26.92%  (p=0.008 n=5+5)

wuffs_png_decode_image_40k_24bpp/clang9                    127MB/s ± 0%    129MB/s ± 0%   +1.56%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9                  128MB/s ± 0%    131MB/s ± 0%   +2.05%  (p=0.008 n=5+5)

wuffs_png_decode_filt_1_dist_3/gcc10                      1.84GB/s ± 0%   1.85GB/s ± 0%     ~     (p=0.095 n=5+5)
wuffs_png_decode_filt_3_dist_3/gcc10                      1.08GB/s ± 0%   1.14GB/s ± 1%   +5.87%  (p=0.008 n=5+5)

wuffs_png_decode_image_40k_24bpp/gcc10                     131MB/s ± 0%    132MB/s ± 0%   +0.46%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10                   131MB/s ± 0%    132MB/s ± 0%   +0.66%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 4fb5fea..708040f 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30517,7 +30517,7 @@
     wuffs_base__slice_u8 i_slice_c = a_curr;
     v_c.ptr = i_slice_c.ptr;
     v_c.len = 3;
-    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 6) * 6);
     while (v_c.ptr < i_end0_c) {
       v_fa0 += v_c.ptr[0];
       v_c.ptr[0] = v_fa0;
@@ -30526,6 +30526,24 @@
       v_fa2 += v_c.ptr[2];
       v_c.ptr[2] = v_fa2;
       v_c.ptr += 3;
+      v_fa0 += v_c.ptr[0];
+      v_c.ptr[0] = v_fa0;
+      v_fa1 += v_c.ptr[1];
+      v_c.ptr[1] = v_fa1;
+      v_fa2 += v_c.ptr[2];
+      v_c.ptr[2] = v_fa2;
+      v_c.ptr += 3;
+    }
+    v_c.len = 3;
+    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+    while (v_c.ptr < i_end1_c) {
+      v_fa0 += v_c.ptr[0];
+      v_c.ptr[0] = v_fa0;
+      v_fa1 += v_c.ptr[1];
+      v_c.ptr[1] = v_fa1;
+      v_fa2 += v_c.ptr[2];
+      v_c.ptr[2] = v_fa2;
+      v_c.ptr += 3;
     }
     v_c.len = 0;
   }
@@ -30652,7 +30670,7 @@
       wuffs_base__slice_u8 i_slice_c = a_curr;
       v_c.ptr = i_slice_c.ptr;
       v_c.len = 3;
-      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 6) * 6);
       while (v_c.ptr < i_end0_c) {
         v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
         v_c.ptr[0] = v_fa0;
@@ -30661,6 +30679,24 @@
         v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
         v_c.ptr[2] = v_fa2;
         v_c.ptr += 3;
+        v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
+        v_c.ptr[0] = v_fa0;
+        v_fa1 = ((v_fa1 / 2) + v_c.ptr[1]);
+        v_c.ptr[1] = v_fa1;
+        v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
+        v_c.ptr[2] = v_fa2;
+        v_c.ptr += 3;
+      }
+      v_c.len = 3;
+      uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+      while (v_c.ptr < i_end1_c) {
+        v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
+        v_c.ptr[0] = v_fa0;
+        v_fa1 = ((v_fa1 / 2) + v_c.ptr[1]);
+        v_c.ptr[1] = v_fa1;
+        v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
+        v_c.ptr[2] = v_fa2;
+        v_c.ptr += 3;
       }
       v_c.len = 0;
     }
@@ -30673,7 +30709,7 @@
       i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
       v_c.len = 3;
       v_p.len = 3;
-      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 6) * 6);
       while (v_c.ptr < i_end0_c) {
         v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
         v_c.ptr[0] = v_fa0;
@@ -30683,6 +30719,27 @@
         v_c.ptr[2] = v_fa2;
         v_c.ptr += 3;
         v_p.ptr += 3;
+        v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
+        v_c.ptr[0] = v_fa0;
+        v_fa1 = (((uint8_t)(((((uint32_t)(v_fa1)) + ((uint32_t)(v_p.ptr[1]))) / 2))) + v_c.ptr[1]);
+        v_c.ptr[1] = v_fa1;
+        v_fa2 = (((uint8_t)(((((uint32_t)(v_fa2)) + ((uint32_t)(v_p.ptr[2]))) / 2))) + v_c.ptr[2]);
+        v_c.ptr[2] = v_fa2;
+        v_c.ptr += 3;
+        v_p.ptr += 3;
+      }
+      v_c.len = 3;
+      v_p.len = 3;
+      uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+      while (v_c.ptr < i_end1_c) {
+        v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
+        v_c.ptr[0] = v_fa0;
+        v_fa1 = (((uint8_t)(((((uint32_t)(v_fa1)) + ((uint32_t)(v_p.ptr[1]))) / 2))) + v_c.ptr[1]);
+        v_c.ptr[1] = v_fa1;
+        v_fa2 = (((uint8_t)(((((uint32_t)(v_fa2)) + ((uint32_t)(v_p.ptr[2]))) / 2))) + v_c.ptr[2]);
+        v_c.ptr[2] = v_fa2;
+        v_c.ptr += 3;
+        v_p.ptr += 3;
       }
       v_c.len = 0;
       v_p.len = 0;
diff --git a/std/png/decode_filter_fallback.wuffs b/std/png/decode_filter_fallback.wuffs
index ac10233..930a475 100644
--- a/std/png/decode_filter_fallback.wuffs
+++ b/std/png/decode_filter_fallback.wuffs
@@ -47,7 +47,7 @@
 	var fa1 : base.u8
 	var fa2 : base.u8
 
-	iterate (c = args.curr)(length: 3, advance: 3, unroll: 1) {
+	iterate (c = args.curr)(length: 3, advance: 3, unroll: 2) {
 		fa0 ~mod+= c[0]
 		c[0] = fa0
 		fa1 ~mod+= c[1]
@@ -163,7 +163,7 @@
 	var fa2 : base.u8
 
 	if args.prev.length() == 0 {
-		iterate (c = args.curr)(length: 3, advance: 3, unroll: 1) {
+		iterate (c = args.curr)(length: 3, advance: 3, unroll: 2) {
 			fa0 = (fa0 / 2) ~mod+ c[0]
 			c[0] = fa0
 			fa1 = (fa1 / 2) ~mod+ c[1]
@@ -173,7 +173,7 @@
 		}
 
 	} else {
-		iterate (c = args.curr, p = args.prev)(length: 3, advance: 3, unroll: 1) {
+		iterate (c = args.curr, p = args.prev)(length: 3, advance: 3, unroll: 2) {
 			fa0 = ((((fa0 as base.u32) + (p[0] as base.u32)) / 2) as base.u8) ~mod+ c[0]
 			c[0] = fa0
 			fa1 = ((((fa1 as base.u32) + (p[1] as base.u32)) / 2) as base.u8) ~mod+ c[1]