Let iterate have multiple assigns

wuffs_png_decode_19k_8bpp/clang9          92.2MB/s ± 0%  93.4MB/s ± 0%   +1.33%  (p=0.008 n=5+5)
wuffs_png_decode_40k_24bpp/clang9          116MB/s ± 0%   115MB/s ± 0%     ~     (p=1.000 n=5+5)
wuffs_png_decode_77k_8bpp/clang9           347MB/s ± 0%   351MB/s ± 0%   +1.19%  (p=0.016 n=5+4)
wuffs_png_decode_552k_32bpp/clang9         150MB/s ± 0%   156MB/s ± 0%   +3.57%  (p=0.008 n=5+5)
wuffs_png_decode_4002k_24bpp/clang9        117MB/s ± 1%   118MB/s ± 0%   +0.62%  (p=0.008 n=5+5)

wuffs_png_decode_filter_1_sub/clang9      1.34GB/s ± 0%  1.24GB/s ± 0%   -7.30%  (p=0.008 n=5+5)
wuffs_png_decode_filter_2_up/clang9       13.3GB/s ± 0%  13.4GB/s ± 0%   +0.96%  (p=0.008 n=5+5)
wuffs_png_decode_filter_3_average/clang9   754MB/s ± 0%   948MB/s ± 0%  +25.62%  (p=0.008 n=5+5)
wuffs_png_decode_filter_4_paeth/clang9     127MB/s ± 0%   138MB/s ± 0%   +9.01%  (p=0.008 n=5+5)

wuffs_png_decode_19k_8bpp/gcc10            100MB/s ± 1%   100MB/s ± 0%     ~     (p=0.690 n=5+5)
wuffs_png_decode_40k_24bpp/gcc10           114MB/s ± 0%   118MB/s ± 0%   +3.21%  (p=0.008 n=5+5)
wuffs_png_decode_77k_8bpp/gcc10            323MB/s ± 2%   325MB/s ± 0%     ~     (p=0.841 n=5+5)
wuffs_png_decode_552k_32bpp/gcc10          159MB/s ± 0%   172MB/s ± 0%   +8.38%  (p=0.008 n=5+5)
wuffs_png_decode_4002k_24bpp/gcc10         118MB/s ± 0%   118MB/s ± 0%     ~     (p=0.056 n=5+5)

wuffs_png_decode_filter_1_sub/gcc10       1.84GB/s ± 1%  1.85GB/s ± 0%   +0.22%  (p=0.008 n=5+5)
wuffs_png_decode_filter_2_up/gcc10        10.5GB/s ± 0%  10.5GB/s ± 0%     ~     (p=0.310 n=5+5)
wuffs_png_decode_filter_3_average/gcc10    743MB/s ± 0%  1008MB/s ± 0%  +35.66%  (p=0.008 n=5+5)
wuffs_png_decode_filter_4_paeth/gcc10     83.1MB/s ± 0%  94.1MB/s ± 0%  +13.15%  (p=0.008 n=5+5)
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index 170b3be..ca23e0d 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -396,22 +396,26 @@
 	if len(assigns) == 0 {
 		return nil
 	}
-	if len(assigns) != 1 {
-		return fmt.Errorf("TODO: iterate over more than one assign")
-	}
-	o := assigns[0].AsAssign()
-	name := o.LHS().Ident().Str(g.tm)
+	name0 := assigns[0].AsAssign().LHS().Ident().Str(g.tm)
 	b.writes("{\n")
 
 	// TODO: don't assume that the slice is a slice of base.u8. In
 	// particular, the code gen can be subtle if the slice element type has
 	// zero size, such as the empty struct.
-	b.printf("wuffs_base__slice_u8 %sslice_%s = ", iPrefix, name)
-	if err := g.writeExpr(b, o.RHS(), 0); err != nil {
-		return err
+	for i, o := range assigns {
+		o := o.AsAssign()
+		name := o.LHS().Ident().Str(g.tm)
+		b.printf("wuffs_base__slice_u8 %sslice_%s = ", iPrefix, name)
+		if err := g.writeExpr(b, o.RHS(), 0); err != nil {
+			return err
+		}
+		b.writes(";\n")
+		b.printf("%s%s = %sslice_%s;\n", vPrefix, name, iPrefix, name)
+		if i > 0 {
+			b.printf("%sslice_%s.len = wuffs_base__u64__min(%sslice_%s.len, %sslice_%s.len);\n",
+				iPrefix, name0, iPrefix, name0, iPrefix, name)
+		}
 	}
-	b.writes(";\n")
-	b.printf("%s%s = %sslice_%s;\n", vPrefix, name, iPrefix, name)
 	// TODO: look at n.HasContinue() and n.HasBreak().
 
 	round := uint32(0)
@@ -425,7 +429,7 @@
 			return err
 		}
 		for {
-			if err := g.writeIterateRound(b, name, n.Body(), round, depth, length, unroll); err != nil {
+			if err := g.writeIterateRound(b, assigns, n.Body(), round, depth, length, unroll); err != nil {
 				return err
 			}
 			round++
@@ -566,18 +570,25 @@
 	return nil
 }
 
-func (g *gen) writeIterateRound(b *buffer, name string, body []*a.Node, round uint32, depth uint32, length int, unroll int) error {
-	b.printf("%s%s.len = %d;\n", vPrefix, name, length)
+func (g *gen) writeIterateRound(b *buffer, assigns []*a.Node, body []*a.Node, round uint32, depth uint32, length int, unroll int) error {
+	for _, o := range assigns {
+		name := o.AsAssign().LHS().Ident().Str(g.tm)
+		b.printf("%s%s.len = %d;\n", vPrefix, name, length)
+	}
+	name0 := assigns[0].AsAssign().LHS().Ident().Str(g.tm)
 	b.printf("uint8_t* %send%d_%s = %sslice_%s.ptr + (%sslice_%s.len / %d) * %d;\n",
-		iPrefix, round, name, iPrefix, name, iPrefix, name, length*unroll, length*unroll)
-	b.printf("while (%s%s.ptr < %send%d_%s) {\n", vPrefix, name, iPrefix, round, name)
+		iPrefix, round, name0, iPrefix, name0, iPrefix, name0, length*unroll, length*unroll)
+	b.printf("while (%s%s.ptr < %send%d_%s) {\n", vPrefix, name0, iPrefix, round, name0)
 	for i := 0; i < unroll; i++ {
 		for _, o := range body {
 			if err := g.writeStatement(b, o, depth); err != nil {
 				return err
 			}
 		}
-		b.printf("%s%s.ptr += %d;\n", vPrefix, name, length)
+		for _, o := range assigns {
+			name := o.AsAssign().LHS().Ident().Str(g.tm)
+			b.printf("%s%s.ptr += %d;\n", vPrefix, name, length)
+		}
 	}
 	b.writes("}\n")
 	return nil
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index ace5131..61c7dda 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -29880,6 +29880,12 @@
     wuffs_base__slice_u8 a_prev);
 
 static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_fallback(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev);
+
+static wuffs_base__empty_struct
 wuffs_png__decoder__filter_4(
     wuffs_png__decoder* self,
     wuffs_base__slice_u8 a_curr,
@@ -29892,6 +29898,12 @@
     wuffs_base__slice_u8 a_prev);
 
 static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_fallback(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev);
+
+static wuffs_base__empty_struct
 wuffs_png__decoder__choose_filter_implementations(
     wuffs_png__decoder* self);
 
@@ -30071,27 +30083,27 @@
 wuffs_png__decoder__filter_1_distance_4_fallback(
     wuffs_png__decoder* self,
     wuffs_base__slice_u8 a_curr) {
-  wuffs_base__slice_u8 v_p = {0};
+  wuffs_base__slice_u8 v_c = {0};
   uint8_t v_fa0 = 0;
   uint8_t v_fa1 = 0;
   uint8_t v_fa2 = 0;
   uint8_t v_fa3 = 0;
 
   {
-    wuffs_base__slice_u8 i_slice_p = a_curr;
-    v_p = i_slice_p;
-    v_p.len = 4;
-    uint8_t* i_end0_p = i_slice_p.ptr + (i_slice_p.len / 4) * 4;
-    while (v_p.ptr < i_end0_p) {
-      v_fa0 += v_p.ptr[0];
-      v_p.ptr[0] = v_fa0;
-      v_fa1 += v_p.ptr[1];
-      v_p.ptr[1] = v_fa1;
-      v_fa2 += v_p.ptr[2];
-      v_p.ptr[2] = v_fa2;
-      v_fa3 += v_p.ptr[3];
-      v_p.ptr[3] = v_fa3;
-      v_p.ptr += 4;
+    wuffs_base__slice_u8 i_slice_c = a_curr;
+    v_c = i_slice_c;
+    v_c.len = 4;
+    uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+    while (v_c.ptr < i_end0_c) {
+      v_fa0 += v_c.ptr[0];
+      v_c.ptr[0] = v_fa0;
+      v_fa1 += v_c.ptr[1];
+      v_c.ptr[1] = v_fa1;
+      v_fa2 += v_c.ptr[2];
+      v_c.ptr[2] = v_fa2;
+      v_fa3 += v_c.ptr[3];
+      v_c.ptr[3] = v_fa3;
+      v_c.ptr += 4;
     }
   }
   return wuffs_base__make_empty_struct();
@@ -30166,6 +30178,65 @@
   return wuffs_base__make_empty_struct();
 }
 
+// -------- func png.decoder.filter_3_distance_4_fallback
+
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_3_distance_4_fallback(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev) {
+  wuffs_base__slice_u8 v_c = {0};
+  wuffs_base__slice_u8 v_p = {0};
+  uint8_t v_fa0 = 0;
+  uint8_t v_fa1 = 0;
+  uint8_t v_fa2 = 0;
+  uint8_t v_fa3 = 0;
+
+  if (((uint64_t)(a_prev.len)) == 0) {
+    {
+      wuffs_base__slice_u8 i_slice_c = a_curr;
+      v_c = i_slice_c;
+      v_c.len = 4;
+      uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+      while (v_c.ptr < i_end0_c) {
+        v_fa0 = ((v_fa0 / 2) + v_c.ptr[0]);
+        v_c.ptr[0] = v_fa0;
+        v_fa1 = ((v_fa1 / 2) + v_c.ptr[1]);
+        v_c.ptr[1] = v_fa1;
+        v_fa2 = ((v_fa2 / 2) + v_c.ptr[2]);
+        v_c.ptr[2] = v_fa2;
+        v_fa3 = ((v_fa3 / 2) + v_c.ptr[3]);
+        v_c.ptr[3] = v_fa3;
+        v_c.ptr += 4;
+      }
+    }
+  } else {
+    {
+      wuffs_base__slice_u8 i_slice_c = a_curr;
+      v_c = i_slice_c;
+      wuffs_base__slice_u8 i_slice_p = a_prev;
+      v_p = i_slice_p;
+      i_slice_c.len = wuffs_base__u64__min(i_slice_c.len, i_slice_p.len);
+      v_c.len = 4;
+      v_p.len = 4;
+      uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+      while (v_c.ptr < i_end0_c) {
+        v_fa0 = (((uint8_t)(((((uint32_t)(v_fa0)) + ((uint32_t)(v_p.ptr[0]))) / 2))) + v_c.ptr[0]);
+        v_c.ptr[0] = v_fa0;
+        v_fa1 = (((uint8_t)(((((uint32_t)(v_fa1)) + ((uint32_t)(v_p.ptr[1]))) / 2))) + v_c.ptr[1]);
+        v_c.ptr[1] = v_fa1;
+        v_fa2 = (((uint8_t)(((((uint32_t)(v_fa2)) + ((uint32_t)(v_p.ptr[2]))) / 2))) + v_c.ptr[2]);
+        v_c.ptr[2] = v_fa2;
+        v_fa3 = (((uint8_t)(((((uint32_t)(v_fa3)) + ((uint32_t)(v_p.ptr[3]))) / 2))) + v_c.ptr[3]);
+        v_c.ptr[3] = v_fa3;
+        v_c.ptr += 4;
+        v_p.ptr += 4;
+      }
+    }
+  }
+  return wuffs_base__make_empty_struct();
+}
+
 // -------- func png.decoder.filter_4
 
 static wuffs_base__empty_struct
@@ -30229,6 +30300,153 @@
   return wuffs_base__make_empty_struct();
 }
 
+// -------- func png.decoder.filter_4_distance_4_fallback
+
+static wuffs_base__empty_struct
+wuffs_png__decoder__filter_4_distance_4_fallback(
+    wuffs_png__decoder* self,
+    wuffs_base__slice_u8 a_curr,
+    wuffs_base__slice_u8 a_prev) {
+  wuffs_base__slice_u8 v_c = {0};
+  wuffs_base__slice_u8 v_p = {0};
+  uint32_t v_fa0 = 0;
+  uint32_t v_fa1 = 0;
+  uint32_t v_fa2 = 0;
+  uint32_t v_fa3 = 0;
+  uint32_t v_fb0 = 0;
+  uint32_t v_fb1 = 0;
+  uint32_t v_fb2 = 0;
+  uint32_t v_fb3 = 0;
+  uint32_t v_fc0 = 0;
+  uint32_t v_fc1 = 0;
+  uint32_t v_fc2 = 0;
+  uint32_t v_fc3 = 0;
+  uint32_t v_pp0 = 0;
+  uint32_t v_pp1 = 0;
+  uint32_t v_pp2 = 0;
+  uint32_t v_pp3 = 0;
+  uint32_t v_pa0 = 0;
+  uint32_t v_pa1 = 0;
+  uint32_t v_pa2 = 0;
+  uint32_t v_pa3 = 0;
+  uint32_t v_pb0 = 0;
+  uint32_t v_pb1 = 0;
+  uint32_t v_pb2 = 0;
+  uint32_t v_pb3 = 0;
+  uint32_t v_pc0 = 0;
+  uint32_t v_pc1 = 0;
+  uint32_t v_pc2 = 0;
+  uint32_t v_pc3 = 0;
+
+  {
+    wuffs_base__slice_u8 i_slice_c = a_curr;
+    v_c = i_slice_c;
+    wuffs_base__slice_u8 i_slice_p = a_prev;
+    v_p = i_slice_p;
+    i_slice_c.len = wuffs_base__u64__min(i_slice_c.len, i_slice_p.len);
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end0_c = i_slice_c.ptr + (i_slice_c.len / 4) * 4;
+    while (v_c.ptr < i_end0_c) {
+      v_fb0 = ((uint32_t)(v_p.ptr[0]));
+      v_pp0 = ((v_fa0 + v_fb0) - v_fc0);
+      v_pa0 = (v_pp0 - v_fa0);
+      if (v_pa0 >= 2147483648) {
+        v_pa0 = (0 - v_pa0);
+      }
+      v_pb0 = (v_pp0 - v_fb0);
+      if (v_pb0 >= 2147483648) {
+        v_pb0 = (0 - v_pb0);
+      }
+      v_pc0 = (v_pp0 - v_fc0);
+      if (v_pc0 >= 2147483648) {
+        v_pc0 = (0 - v_pc0);
+      }
+      if ((v_pa0 <= v_pb0) && (v_pa0 <= v_pc0)) {
+      } else if (v_pb0 <= v_pc0) {
+        v_fa0 = v_fb0;
+      } else {
+        v_fa0 = v_fc0;
+      }
+      v_c.ptr[0] += ((uint8_t)((v_fa0 & 255)));
+      v_fa0 = ((uint32_t)(v_c.ptr[0]));
+      v_fc0 = v_fb0;
+      v_fb1 = ((uint32_t)(v_p.ptr[1]));
+      v_pp1 = ((v_fa1 + v_fb1) - v_fc1);
+      v_pa1 = (v_pp1 - v_fa1);
+      if (v_pa1 >= 2147483648) {
+        v_pa1 = (0 - v_pa1);
+      }
+      v_pb1 = (v_pp1 - v_fb1);
+      if (v_pb1 >= 2147483648) {
+        v_pb1 = (0 - v_pb1);
+      }
+      v_pc1 = (v_pp1 - v_fc1);
+      if (v_pc1 >= 2147483648) {
+        v_pc1 = (0 - v_pc1);
+      }
+      if ((v_pa1 <= v_pb1) && (v_pa1 <= v_pc1)) {
+      } else if (v_pb1 <= v_pc1) {
+        v_fa1 = v_fb1;
+      } else {
+        v_fa1 = v_fc1;
+      }
+      v_c.ptr[1] += ((uint8_t)((v_fa1 & 255)));
+      v_fa1 = ((uint32_t)(v_c.ptr[1]));
+      v_fc1 = v_fb1;
+      v_fb2 = ((uint32_t)(v_p.ptr[2]));
+      v_pp2 = ((v_fa2 + v_fb2) - v_fc2);
+      v_pa2 = (v_pp2 - v_fa2);
+      if (v_pa2 >= 2147483648) {
+        v_pa2 = (0 - v_pa2);
+      }
+      v_pb2 = (v_pp2 - v_fb2);
+      if (v_pb2 >= 2147483648) {
+        v_pb2 = (0 - v_pb2);
+      }
+      v_pc2 = (v_pp2 - v_fc2);
+      if (v_pc2 >= 2147483648) {
+        v_pc2 = (0 - v_pc2);
+      }
+      if ((v_pa2 <= v_pb2) && (v_pa2 <= v_pc2)) {
+      } else if (v_pb2 <= v_pc2) {
+        v_fa2 = v_fb2;
+      } else {
+        v_fa2 = v_fc2;
+      }
+      v_c.ptr[2] += ((uint8_t)((v_fa2 & 255)));
+      v_fa2 = ((uint32_t)(v_c.ptr[2]));
+      v_fc2 = v_fb2;
+      v_fb3 = ((uint32_t)(v_p.ptr[3]));
+      v_pp3 = ((v_fa3 + v_fb3) - v_fc3);
+      v_pa3 = (v_pp3 - v_fa3);
+      if (v_pa3 >= 2147483648) {
+        v_pa3 = (0 - v_pa3);
+      }
+      v_pb3 = (v_pp3 - v_fb3);
+      if (v_pb3 >= 2147483648) {
+        v_pb3 = (0 - v_pb3);
+      }
+      v_pc3 = (v_pp3 - v_fc3);
+      if (v_pc3 >= 2147483648) {
+        v_pc3 = (0 - v_pc3);
+      }
+      if ((v_pa3 <= v_pb3) && (v_pa3 <= v_pc3)) {
+      } else if (v_pb3 <= v_pc3) {
+        v_fa3 = v_fb3;
+      } else {
+        v_fa3 = v_fc3;
+      }
+      v_c.ptr[3] += ((uint8_t)((v_fa3 & 255)));
+      v_fa3 = ((uint32_t)(v_c.ptr[3]));
+      v_fc3 = v_fb3;
+      v_c.ptr += 4;
+      v_p.ptr += 4;
+    }
+  }
+  return wuffs_base__make_empty_struct();
+}
+
 // -------- func png.decoder.set_quirk_enabled
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__empty_struct
@@ -30714,6 +30932,8 @@
   if (self->private_impl.f_filter_distance == 3) {
   } else if (self->private_impl.f_filter_distance == 4) {
     self->private_impl.choosy_filter_1 = &wuffs_png__decoder__filter_1_distance_4_fallback;
+    self->private_impl.choosy_filter_3 = &wuffs_png__decoder__filter_3_distance_4_fallback;
+    self->private_impl.choosy_filter_4 = &wuffs_png__decoder__filter_4_distance_4_fallback;
   }
   return wuffs_base__make_empty_struct();
 }
diff --git a/std/png/decode_filter_fallback.wuffs b/std/png/decode_filter_fallback.wuffs
index 972d990..b879ea6 100644
--- a/std/png/decode_filter_fallback.wuffs
+++ b/std/png/decode_filter_fallback.wuffs
@@ -38,21 +38,21 @@
 }
 
 pri func decoder.filter_1_distance_4_fallback!(curr: slice base.u8) {
-	var p   : slice base.u8
+	var c   : slice base.u8
 	var fa0 : base.u8
 	var fa1 : base.u8
 	var fa2 : base.u8
 	var fa3 : base.u8
 
-	iterate (p = args.curr)(length: 4, unroll: 1) {
-		fa0 ~mod+= p[0]
-		p[0] = fa0
-		fa1 ~mod+= p[1]
-		p[1] = fa1
-		fa2 ~mod+= p[2]
-		p[2] = fa2
-		fa3 ~mod+= p[3]
-		p[3] = fa3
+	iterate (c = args.curr)(length: 4, unroll: 1) {
+		fa0 ~mod+= c[0]
+		c[0] = fa0
+		fa1 ~mod+= c[1]
+		c[1] = fa1
+		fa2 ~mod+= c[2]
+		c[2] = fa2
+		fa3 ~mod+= c[3]
+		c[3] = fa3
 	}
 }
 
@@ -131,6 +131,40 @@
 	}
 }
 
+pri func decoder.filter_3_distance_4_fallback!(curr: slice base.u8, prev: slice base.u8) {
+	var c   : slice base.u8
+	var p   : slice base.u8
+	var fa0 : base.u8
+	var fa1 : base.u8
+	var fa2 : base.u8
+	var fa3 : base.u8
+
+	if args.prev.length() == 0 {
+		iterate (c = args.curr)(length: 4, unroll: 1) {
+			fa0 = (fa0 / 2) ~mod+ c[0]
+			c[0] = fa0
+			fa1 = (fa1 / 2) ~mod+ c[1]
+			c[1] = fa1
+			fa2 = (fa2 / 2) ~mod+ c[2]
+			c[2] = fa2
+			fa3 = (fa3 / 2) ~mod+ c[3]
+			c[3] = fa3
+		}
+
+	} else {
+		iterate (c = args.curr, p = args.prev)(length: 4, unroll: 1) {
+			fa0 = ((((fa0 as base.u32) + (p[0] as base.u32)) / 2) as base.u8) ~mod+ c[0]
+			c[0] = fa0
+			fa1 = ((((fa1 as base.u32) + (p[1] as base.u32)) / 2) as base.u8) ~mod+ c[1]
+			c[1] = fa1
+			fa2 = ((((fa2 as base.u32) + (p[2] as base.u32)) / 2) as base.u8) ~mod+ c[2]
+			c[2] = fa2
+			fa3 = ((((fa3 as base.u32) + (p[3] as base.u32)) / 2) as base.u8) ~mod+ c[3]
+			c[3] = fa3
+		}
+	}
+}
+
 pri func decoder.filter_4!(curr: slice base.u8, prev: slice base.u8),
 	choosy,
 {
@@ -200,3 +234,138 @@
 		assert i >= filter_distance via "a >= b: a >= (b + c); 0 <= c"(c: 1)
 	} endwhile
 }
+
+pri func decoder.filter_4_distance_4_fallback!(curr: slice base.u8, prev: slice base.u8) {
+	var c   : slice base.u8
+	var p   : slice base.u8
+	var fa0 : base.u32
+	var fa1 : base.u32
+	var fa2 : base.u32
+	var fa3 : base.u32
+	var fb0 : base.u32
+	var fb1 : base.u32
+	var fb2 : base.u32
+	var fb3 : base.u32
+	var fc0 : base.u32
+	var fc1 : base.u32
+	var fc2 : base.u32
+	var fc3 : base.u32
+	var pp0 : base.u32
+	var pp1 : base.u32
+	var pp2 : base.u32
+	var pp3 : base.u32
+	var pa0 : base.u32
+	var pa1 : base.u32
+	var pa2 : base.u32
+	var pa3 : base.u32
+	var pb0 : base.u32
+	var pb1 : base.u32
+	var pb2 : base.u32
+	var pb3 : base.u32
+	var pc0 : base.u32
+	var pc1 : base.u32
+	var pc2 : base.u32
+	var pc3 : base.u32
+
+	iterate (c = args.curr, p = args.prev)(length: 4, unroll: 1) {
+		fb0 = p[0] as base.u32
+		pp0 = (fa0 ~mod+ fb0) ~mod- fc0
+		pa0 = pp0 ~mod- fa0
+		if pa0 >= 0x8000_0000 {
+			pa0 = 0 ~mod- pa0
+		}
+		pb0 = pp0 ~mod- fb0
+		if pb0 >= 0x8000_0000 {
+			pb0 = 0 ~mod- pb0
+		}
+		pc0 = pp0 ~mod- fc0
+		if pc0 >= 0x8000_0000 {
+			pc0 = 0 ~mod- pc0
+		}
+		if (pa0 <= pb0) and (pa0 <= pc0) {
+			// No-op.
+		} else if pb0 <= pc0 {
+			fa0 = fb0
+		} else {
+			fa0 = fc0
+		}
+		c[0] ~mod+= (fa0 & 0xFF) as base.u8
+		fa0 = c[0] as base.u32
+		fc0 = fb0
+
+		fb1 = p[1] as base.u32
+		pp1 = (fa1 ~mod+ fb1) ~mod- fc1
+		pa1 = pp1 ~mod- fa1
+		if pa1 >= 0x8000_0000 {
+			pa1 = 0 ~mod- pa1
+		}
+		pb1 = pp1 ~mod- fb1
+		if pb1 >= 0x8000_0000 {
+			pb1 = 0 ~mod- pb1
+		}
+		pc1 = pp1 ~mod- fc1
+		if pc1 >= 0x8000_0000 {
+			pc1 = 0 ~mod- pc1
+		}
+		if (pa1 <= pb1) and (pa1 <= pc1) {
+			// No-op.
+		} else if pb1 <= pc1 {
+			fa1 = fb1
+		} else {
+			fa1 = fc1
+		}
+		c[1] ~mod+= (fa1 & 0xFF) as base.u8
+		fa1 = c[1] as base.u32
+		fc1 = fb1
+
+		fb2 = p[2] as base.u32
+		pp2 = (fa2 ~mod+ fb2) ~mod- fc2
+		pa2 = pp2 ~mod- fa2
+		if pa2 >= 0x8000_0000 {
+			pa2 = 0 ~mod- pa2
+		}
+		pb2 = pp2 ~mod- fb2
+		if pb2 >= 0x8000_0000 {
+			pb2 = 0 ~mod- pb2
+		}
+		pc2 = pp2 ~mod- fc2
+		if pc2 >= 0x8000_0000 {
+			pc2 = 0 ~mod- pc2
+		}
+		if (pa2 <= pb2) and (pa2 <= pc2) {
+			// No-op.
+		} else if pb2 <= pc2 {
+			fa2 = fb2
+		} else {
+			fa2 = fc2
+		}
+		c[2] ~mod+= (fa2 & 0xFF) as base.u8
+		fa2 = c[2] as base.u32
+		fc2 = fb2
+
+		fb3 = p[3] as base.u32
+		pp3 = (fa3 ~mod+ fb3) ~mod- fc3
+		pa3 = pp3 ~mod- fa3
+		if pa3 >= 0x8000_0000 {
+			pa3 = 0 ~mod- pa3
+		}
+		pb3 = pp3 ~mod- fb3
+		if pb3 >= 0x8000_0000 {
+			pb3 = 0 ~mod- pb3
+		}
+		pc3 = pp3 ~mod- fc3
+		if pc3 >= 0x8000_0000 {
+			pc3 = 0 ~mod- pc3
+		}
+		if (pa3 <= pb3) and (pa3 <= pc3) {
+			// No-op.
+		} else if pb3 <= pc3 {
+			fa3 = fb3
+		} else {
+			fa3 = fc3
+		}
+		c[3] ~mod+= (fa3 & 0xFF) as base.u8
+		fa3 = c[3] as base.u32
+		fc3 = fb3
+	}
+}
diff --git a/std/png/decode_png.wuffs b/std/png/decode_png.wuffs
index 8a89283..79d3100 100644
--- a/std/png/decode_png.wuffs
+++ b/std/png/decode_png.wuffs
@@ -246,10 +246,14 @@
 }
 
 pri func decoder.choose_filter_implementations!() {
+	// Filter 0 is a no-op. Filter 2, the up filter, should already be easily
+	// vectorized by a good optimizing C compiler.
 	if this.filter_distance == 3 {
 		// TODO.
 	} else if this.filter_distance == 4 {
 		choose filter_1 = [filter_1_distance_4_fallback]
+		choose filter_3 = [filter_3_distance_4_fallback]
+		choose filter_4 = [filter_4_distance_4_fallback]
 	}
 }