Unroll some std/png sse128 loops
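
Change the iterate loops' unroll count from 1 to 2 in the filter 1
("Sub"), filter 3 ("Average") and filter 4 ("Paeth") SIMD code paths.
In the generated C code, each affected loop splits in two: a main loop
whose body is the original SIMD body repeated twice, with its end bound
rounding down to a multiple of 8 bytes instead of 4, followed by the
original single-step loop that picks up any remaining group. For the
filter 4 loop that reads 4 bytes but advances only 3, the unrolled pass
instead calls wuffs_base__iterate_total_advance with (length, advance)
= (7, 6) rather than (4, 3): two overlapping 4-byte reads spaced 3
bytes apart touch 7 bytes while advancing 6.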

name                                                     old speed      new speed      delta
wuffs_png_decode_filt_1_dist_3/clang9                     1.57GB/s ± 0%  1.57GB/s ± 0%     ~     (p=0.151 n=5+5)
wuffs_png_decode_filt_1_dist_4/clang9                     4.62GB/s ± 0%  5.87GB/s ± 1%  +26.93%  (p=0.008 n=5+5)
wuffs_png_decode_filt_2_dist_3/clang9                     13.3GB/s ± 0%  13.1GB/s ± 1%   -1.52%  (p=0.008 n=5+5)
wuffs_png_decode_filt_2_dist_4/clang9                     13.3GB/s ± 0%  13.1GB/s ± 1%   -1.50%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/clang9                      910MB/s ± 0%   909MB/s ± 0%     ~     (p=0.095 n=5+5)
wuffs_png_decode_filt_3_dist_4/clang9                     1.78GB/s ± 0%  1.84GB/s ± 0%   +3.65%  (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_3/clang9                      542MB/s ± 0%   571MB/s ± 0%   +5.31%  (p=0.016 n=4+5)
wuffs_png_decode_filt_4_dist_4/clang9                      844MB/s ± 0%   819MB/s ± 0%   -3.00%  (p=0.008 n=5+5)

wuffs_png_decode_image_40k_24bpp/clang9                    127MB/s ± 0%   127MB/s ± 0%     ~     (p=0.310 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   326MB/s ± 0%   322MB/s ± 0%   -1.19%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   259MB/s ± 0%   256MB/s ± 0%   -1.31%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9                  128MB/s ± 0%   128MB/s ± 0%     ~     (p=0.278 n=5+5)

wuffs_png_decode_filt_1_dist_3/gcc10                      1.84GB/s ± 0%  1.84GB/s ± 0%     ~     (p=0.190 n=4+5)
wuffs_png_decode_filt_1_dist_4/gcc10                      3.38GB/s ± 1%  5.41GB/s ± 1%  +59.98%  (p=0.008 n=5+5)
wuffs_png_decode_filt_2_dist_3/gcc10                      8.29GB/s ± 0%  8.30GB/s ± 0%     ~     (p=0.222 n=5+5)
wuffs_png_decode_filt_2_dist_4/gcc10                      8.28GB/s ± 0%  8.30GB/s ± 0%   +0.24%  (p=0.008 n=5+5)
wuffs_png_decode_filt_3_dist_3/gcc10                      1.07GB/s ± 0%  1.08GB/s ± 0%     ~     (p=0.421 n=5+5)
wuffs_png_decode_filt_3_dist_4/gcc10                      2.13GB/s ± 0%  2.36GB/s ± 0%  +10.74%  (p=0.008 n=5+5)
wuffs_png_decode_filt_4_dist_3/gcc10                       587MB/s ± 0%   616MB/s ± 0%   +4.95%  (p=0.016 n=4+5)
wuffs_png_decode_filt_4_dist_4/gcc10                       796MB/s ± 0%   871MB/s ± 0%   +9.44%  (p=0.008 n=5+5)

wuffs_png_decode_image_40k_24bpp/gcc10                     130MB/s ± 0%   131MB/s ± 0%   +0.78%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10    320MB/s ± 0%   325MB/s ± 0%   +1.69%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10    270MB/s ± 0%   273MB/s ± 0%   +1.30%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10                   131MB/s ± 0%   131MB/s ± 0%   +0.40%  (p=0.008 n=5+5)
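
For reference, here is a minimal stand-alone C sketch (an illustration
of the generated code's shape, not the generated code itself) of the
unrolled filter 1 ("Sub") loop at 4 bytes per pixel. The peek_u32le and
poke_u32le helpers are stand-ins for Wuffs'
wuffs_base__peek_u32le__no_bounds_check and poke counterparts:

#include <emmintrin.h>  // SSE2 intrinsics.
#include <stdint.h>
#include <string.h>

static inline uint32_t peek_u32le(const uint8_t* p) {
  uint32_t x;
  memcpy(&x, p, 4);  // Unaligned little-endian load.
  return x;
}

static inline void poke_u32le(uint8_t* p, uint32_t x) {
  memcpy(p, &x, 4);  // Unaligned little-endian store.
}

// Filter 1 ("Sub") at 4 bytes per pixel: byte-wise, modulo 256, each
// pixel has the previous (already reconstructed) pixel added to it.
static void filter_1_distance_4(uint8_t* curr, size_t len) {
  uint8_t* p = curr;
  __m128i a128 = _mm_setzero_si128();  // Previous pixel; zero at row start.
  __m128i x128;
  uint8_t* end0 = curr + ((len / 8) * 8);  // Main pass: two pixels per iteration.
  while (p < end0) {
    x128 = _mm_cvtsi32_si128((int)peek_u32le(p));
    x128 = _mm_add_epi8(x128, a128);
    a128 = x128;
    poke_u32le(p, (uint32_t)_mm_cvtsi128_si32(x128));
    p += 4;
    // Second copy of the loop body: the unroll.
    x128 = _mm_cvtsi32_si128((int)peek_u32le(p));
    x128 = _mm_add_epi8(x128, a128);
    a128 = x128;
    poke_u32le(p, (uint32_t)_mm_cvtsi128_si32(x128));
    p += 4;
  }
  uint8_t* end1 = curr + ((len / 4) * 4);  // Remainder: at most one more pixel.
  while (p < end1) {
    x128 = _mm_cvtsi32_si128((int)peek_u32le(p));
    x128 = _mm_add_epi8(x128, a128);
    a128 = x128;
    poke_u32le(p, (uint32_t)_mm_cvtsi128_si32(x128));
    p += 4;
  }
}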
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 3d8aff4..4fb5fea 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31102,13 +31102,27 @@
     wuffs_base__slice_u8 i_slice_c = a_curr;
     v_c.ptr = i_slice_c.ptr;
     v_c.len = 4;
-    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
     while (v_c.ptr < i_end0_c) {
       v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_add_epi8(v_x128, v_a128);
       v_a128 = v_x128;
       wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
       v_c.ptr += 4;
+      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_add_epi8(v_x128, v_a128);
+      v_a128 = v_x128;
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 4;
+    }
+    v_c.len = 4;
+    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+    while (v_c.ptr < i_end1_c) {
+      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_add_epi8(v_x128, v_a128);
+      v_a128 = v_x128;
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 4;
     }
     v_c.len = 0;
   }
@@ -31141,7 +31155,7 @@
       wuffs_base__slice_u8 i_slice_c = a_curr;
       v_c.ptr = i_slice_c.ptr;
       v_c.len = 4;
-      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
       while (v_c.ptr < i_end0_c) {
         v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
         v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
@@ -31149,6 +31163,22 @@
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
         v_c.ptr += 4;
+        v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
+        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_add_epi8(v_x128, v_p128);
+        v_a128 = v_x128;
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+        v_c.ptr += 4;
+      }
+      v_c.len = 4;
+      uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+      while (v_c.ptr < i_end1_c) {
+        v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
+        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_add_epi8(v_x128, v_p128);
+        v_a128 = v_x128;
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+        v_c.ptr += 4;
       }
       v_c.len = 0;
     }
@@ -31162,7 +31192,7 @@
       i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
       v_c.len = 4;
       v_p.len = 4;
-      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+      uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
       while (v_c.ptr < i_end0_c) {
         v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
         v_p128 = _mm_avg_epu8(v_a128, v_b128);
@@ -31173,6 +31203,29 @@
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
         v_c.ptr += 4;
         v_p.ptr += 4;
+        v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_p128 = _mm_avg_epu8(v_a128, v_b128);
+        v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
+        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_add_epi8(v_x128, v_p128);
+        v_a128 = v_x128;
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+        v_c.ptr += 4;
+        v_p.ptr += 4;
+      }
+      v_c.len = 4;
+      v_p.len = 4;
+      uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+      while (v_c.ptr < i_end1_c) {
+        v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_p128 = _mm_avg_epu8(v_a128, v_b128);
+        v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
+        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_add_epi8(v_x128, v_p128);
+        v_a128 = v_x128;
+        wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+        v_c.ptr += 4;
+        v_p.ptr += 4;
       }
       v_c.len = 0;
       v_p.len = 0;
@@ -31214,7 +31267,7 @@
     i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
     v_c.len = 4;
     v_p.len = 4;
-    uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);
+    uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 7, 6);
     while (v_c.ptr < i_end0_c) {
       v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
@@ -31235,11 +31288,54 @@
       wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
       v_c.ptr += 3;
       v_p.ptr += 3;
+      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+      v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+      v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+      v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+      v_pa128 = _mm_abs_epi16(v_pa128);
+      v_pb128 = _mm_abs_epi16(v_pb128);
+      v_pc128 = _mm_abs_epi16(v_pc128);
+      v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+      v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+      v_x128 = _mm_add_epi8(v_x128, v_p128);
+      v_a128 = v_x128;
+      v_c128 = v_b128;
+      v_x128 = _mm_packus_epi16(v_x128, v_x128);
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 3;
+      v_p.ptr += 3;
+    }
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end1_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);
+    while (v_c.ptr < i_end1_c) {
+      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+      v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+      v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+      v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+      v_pa128 = _mm_abs_epi16(v_pa128);
+      v_pb128 = _mm_abs_epi16(v_pb128);
+      v_pc128 = _mm_abs_epi16(v_pc128);
+      v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+      v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+      v_x128 = _mm_add_epi8(v_x128, v_p128);
+      v_a128 = v_x128;
+      v_c128 = v_b128;
+      v_x128 = _mm_packus_epi16(v_x128, v_x128);
+      wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 3;
+      v_p.ptr += 3;
     }
     v_c.len = 3;
     v_p.len = 3;
-    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
-    while (v_c.ptr < i_end1_c) {
+    uint8_t* i_end2_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
+    while (v_c.ptr < i_end2_c) {
       v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u24le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
@@ -31297,7 +31393,7 @@
     i_slice_c.len = ((size_t)(wuffs_base__u64__min(i_slice_c.len, i_slice_p.len)));
     v_c.len = 4;
     v_p.len = 4;
-    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+    uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
     while (v_c.ptr < i_end0_c) {
       v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
@@ -31318,6 +31414,49 @@
       wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
       v_c.ptr += 4;
       v_p.ptr += 4;
+      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+      v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+      v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+      v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+      v_pa128 = _mm_abs_epi16(v_pa128);
+      v_pb128 = _mm_abs_epi16(v_pb128);
+      v_pc128 = _mm_abs_epi16(v_pc128);
+      v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+      v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+      v_x128 = _mm_add_epi8(v_x128, v_p128);
+      v_a128 = v_x128;
+      v_c128 = v_b128;
+      v_x128 = _mm_packus_epi16(v_x128, v_x128);
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 4;
+      v_p.ptr += 4;
+    }
+    v_c.len = 4;
+    v_p.len = 4;
+    uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
+    while (v_c.ptr < i_end1_c) {
+      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
+      v_pa128 = _mm_sub_epi16(v_b128, v_c128);
+      v_pb128 = _mm_sub_epi16(v_a128, v_c128);
+      v_pc128 = _mm_add_epi16(v_pa128, v_pb128);
+      v_pa128 = _mm_abs_epi16(v_pa128);
+      v_pb128 = _mm_abs_epi16(v_pb128);
+      v_pc128 = _mm_abs_epi16(v_pc128);
+      v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
+      v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
+      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
+      v_x128 = _mm_add_epi8(v_x128, v_p128);
+      v_a128 = v_x128;
+      v_c128 = v_b128;
+      v_x128 = _mm_packus_epi16(v_x128, v_x128);
+      wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
+      v_c.ptr += 4;
+      v_p.ptr += 4;
     }
     v_c.len = 0;
     v_p.len = 0;
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index 5d447d5..1e911f8 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -26,7 +26,7 @@
 //     var x128 : base.sse128_i
 //     var a128 : base.sse128_i
 //
-//     iterate (c = args.curr)(length: 4, advance: 3, unroll: 1) {
+//     iterate (c = args.curr)(length: 4, advance: 3, unroll: 2) {
 //         x128.load_u32!(a: c.peek_u32le())
 //         x128 = x128._mm_add_epi8!(b: a128)
 //         a128 = x128
@@ -45,7 +45,7 @@
 	var x128 : base.sse128_i
 	var a128 : base.sse128_i
 
-	iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+	iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
 		x128.load_u32!(a: c.peek_u32le())
 		x128 = x128._mm_add_epi8(b: a128)
 		a128 = x128
@@ -79,7 +79,7 @@
 
 	if args.prev.length() == 0 {
 		k128 = k128.create_mm_set1_epi8(a: 0xFE)
-		iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+		iterate (c = args.curr)(length: 4, advance: 4, unroll: 2) {
 			// The predictor, p128, is just half (rounded down) of the previous
 			// pixel, a128. In this branch, b128 stays zero so the average of
 			// a128 and b128 is just half of a128. _mm_avg_epu8 rounds up, but
@@ -96,7 +96,7 @@
 
 	} else {
 		k128 = k128.create_mm_set1_epi8(a: 0x01)
-		iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
+		iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
 			// Load the pixel from the row above.
 			b128.load_u32!(a: p.peek_u32le())
 
@@ -147,7 +147,7 @@
 	var z128        : base.sse128_i
 
 	// § The advance is 3, not 4.
-	iterate (c = args.curr, p = args.prev)(length: 4, advance: 3, unroll: 1) {
+	iterate (c = args.curr, p = args.prev)(length: 4, advance: 3, unroll: 2) {
 		b128.load_u32!(a: p.peek_u32le())
 		b128 = b128._mm_unpacklo_epi8(b: z128)
 		pa128 = b128._mm_sub_epi16(b: c128)
@@ -217,7 +217,7 @@
 	var smallest128 : base.sse128_i
 	var z128        : base.sse128_i
 
-	iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
+	iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 2) {
 		// Load the pixel from the row above.
 		b128.load_u32!(a: p.peek_u32le())
 		b128 = b128._mm_unpacklo_epi8(b: z128)
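
A note on the filter 3 ("Average") code: the comment in
std/png/decode_filter_sse128.wuffs relies on _mm_avg_epu8 computing
(a + b + 1) >> 1 per unsigned byte, i.e. rounding up, so clearing each
byte's low bit first (the 0xFE mask in k128) and then averaging against
zero gives exactly half, rounded down. A small stand-alone check of
that identity (SSE2 only, written for this commit message, not part of
the Wuffs sources):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  __m128i k128 = _mm_set1_epi8((char)0xFE);  // Clears each byte's low bit.
  __m128i z128 = _mm_setzero_si128();
  for (int i = 0; i < 256; i++) {
    __m128i a128 = _mm_set1_epi8((char)i);
    // avg(a & 0xFE, 0) == ((a & 0xFE) + 1) >> 1 == a / 2, rounded down.
    __m128i half = _mm_avg_epu8(_mm_and_si128(a128, k128), z128);
    if ((uint8_t)_mm_cvtsi128_si32(half) != (uint8_t)(i / 2)) {
      printf("mismatch at %d\n", i);
      return 1;
    }
  }
  printf("ok\n");
  return 0;
}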
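
Similarly, the pa128/pb128/pc128 computations in the filter 4 loops are
the standard Paeth predictor from the PNG specification, evaluated in
16-bit lanes, with the final _mm_blendv_epi8 pair selecting a, then b,
then c on ties. A scalar reference for comparison (the helper name is
illustrative, not a Wuffs API):

#include <stdint.h>
#include <stdlib.h>

// Paeth predictor (PNG spec, "Filter type 4: Paeth"): pick whichever
// of left (a), up (b) or up-left (c) is closest to a + b - c, breaking
// ties in favor of a, then b, then c.
static uint8_t paeth_predict(uint8_t a, uint8_t b, uint8_t c) {
  int pa = abs((int)b - (int)c);               // Distance if we pick a.
  int pb = abs((int)a - (int)c);               // Distance if we pick b.
  int pc = abs((int)a + (int)b - 2 * (int)c);  // Distance if we pick c.
  if ((pa <= pb) && (pa <= pc)) {
    return a;
  } else if (pb <= pc) {
    return b;
  }
  return c;
}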