std/jpeg: prepare for triangle-filter upsampling
diff --git a/fuzz/c/std/pixel_swizzler_fuzzer.c b/fuzz/c/std/pixel_swizzler_fuzzer.c
index aafe60a..a73a462 100644
--- a/fuzz/c/std/pixel_swizzler_fuzzer.c
+++ b/fuzz/c/std/pixel_swizzler_fuzzer.c
@@ -234,7 +234,7 @@
   uint32_t v0 = possible_hv_values[allow_hv3][3 & (hash >> 31)];
   uint32_t v1 = possible_hv_values[allow_hv3][3 & (hash >> 33)];
   uint32_t v2 = possible_hv_values[allow_hv3][3 & (hash >> 35)];
-  // TODO: spend a hash bit for triangle_filter_for_2to1.
+  bool triangle_filter_for_2to1 = 1 & (hash >> 37);
 
   uint32_t width0 = 8 * width_in_mcus * h0;
   uint32_t width1 = 8 * width_in_mcus * h1;
@@ -339,6 +339,7 @@
   wuffs_base__slice_u8 src3 = wuffs_base__empty_slice_u8();
 
   wuffs_base__pixel_swizzler swizzler = {0};
+  uint8_t scratch_buffer[2048];
   wuffs_base__status status = wuffs_base__pixel_swizzler__swizzle_ycck(
       &swizzler, &dst_pixbuf, dst_palette,  //
       width, height,                        //
@@ -348,7 +349,8 @@
       width0, width1, width2, 0,            //
       h0, h1, h2, 0,                        //
       v0, v1, v2, 0,                        //
-      false);
+      triangle_filter_for_2to1,             //
+      wuffs_base__make_slice_u8(scratch_buffer, sizeof(scratch_buffer)));
   if (status.repr) {
     return wuffs_base__status__message(&status);
   }
diff --git a/internal/cgen/base/image-private.h b/internal/cgen/base/image-private.h
index 7d3f01d..83807ae 100644
--- a/internal/cgen/base/image-private.h
+++ b/internal/cgen/base/image-private.h
@@ -39,36 +39,38 @@
     uint64_t num_pixels);
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status  //
-wuffs_base__pixel_swizzler__swizzle_ycck(const wuffs_base__pixel_swizzler* p,
-                                         wuffs_base__pixel_buffer* dst,
-                                         wuffs_base__slice_u8 dst_palette,
-                                         uint32_t width,
-                                         uint32_t height,
-                                         wuffs_base__slice_u8 src0,
-                                         wuffs_base__slice_u8 src1,
-                                         wuffs_base__slice_u8 src2,
-                                         wuffs_base__slice_u8 src3,
-                                         uint32_t width0,
-                                         uint32_t width1,
-                                         uint32_t width2,
-                                         uint32_t width3,
-                                         uint32_t height0,
-                                         uint32_t height1,
-                                         uint32_t height2,
-                                         uint32_t height3,
-                                         uint32_t stride0,
-                                         uint32_t stride1,
-                                         uint32_t stride2,
-                                         uint32_t stride3,
-                                         uint8_t h0,
-                                         uint8_t h1,
-                                         uint8_t h2,
-                                         uint8_t h3,
-                                         uint8_t v0,
-                                         uint8_t v1,
-                                         uint8_t v2,
-                                         uint8_t v3,
-                                         bool triangle_filter_for_2to1);
+wuffs_base__pixel_swizzler__swizzle_ycck(
+    const wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_buffer* dst,
+    wuffs_base__slice_u8 dst_palette,
+    uint32_t width,
+    uint32_t height,
+    wuffs_base__slice_u8 src0,
+    wuffs_base__slice_u8 src1,
+    wuffs_base__slice_u8 src2,
+    wuffs_base__slice_u8 src3,
+    uint32_t width0,
+    uint32_t width1,
+    uint32_t width2,
+    uint32_t width3,
+    uint32_t height0,
+    uint32_t height1,
+    uint32_t height2,
+    uint32_t height3,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t stride3,
+    uint8_t h0,
+    uint8_t h1,
+    uint8_t h2,
+    uint8_t h3,
+    uint8_t v0,
+    uint8_t v1,
+    uint8_t v2,
+    uint8_t v3,
+    bool triangle_filter_for_2to1,
+    wuffs_base__slice_u8 scratch_buffer_2k);
 
 // ---------------- Images (Utility)
 
diff --git a/internal/cgen/base/pixconv-submodule-ycck.c b/internal/cgen/base/pixconv-submodule-ycck.c
index 6705692..004b1e2 100644
--- a/internal/cgen/base/pixconv-submodule-ycck.c
+++ b/internal/cgen/base/pixconv-submodule-ycck.c
@@ -34,14 +34,241 @@
       e);
 }
 
-// Preconditions: see all the checks made in
-// wuffs_base__pixel_swizzler__swizzle_ycck before calling this function. For
+// --------
+
+// wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upsamples to a
+// destination slice at least 480 (YCCK) or 672 (YCC) bytes long. The src_len
+// (multiplied by 1, 2, 3 or 4) is positive but no more than that length. 480
+// or 672 is just under 1/4 or 1/3 of the scratch_buffer_2k slice length. Both
+// (480 * 4) = 1920 and (672 * 3) = 2016 are less than 2048.
+//
+// 480 and 672 are nice round numbers because a JPEG MCU is 1, 2, 3 or 4 blocks
+// wide and each block is 8 pixels wide. We have:
+//   480 = 1 * 8 * 60,   672 = 1 * 8 * 84
+//   480 = 2 * 8 * 30,   672 = 2 * 8 * 42
+//   480 = 3 * 8 * 20,   672 = 3 * 8 * 28
+//   480 = 4 * 8 * 15,   672 = 4 * 8 * 21
+//
+// Box filters are equivalent to nearest neighbor upsampling. These ignore the
+// src_ptr_minor, h1v2_bias, first_column and last_column arguments.
+//
+// TODO: triangle filters.
+typedef const uint8_t* (
+    *wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func)(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,  // Nearest row.
+    const uint8_t* src_ptr_minor,  // Adjacent row, alternating above or below.
+    size_t src_len,
+    uint32_t h1v2_bias,
+    bool first_column,
+    bool last_column);
+
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  return src_ptr_major;
+}
+
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  uint8_t* dp = dst_ptr;
+  const uint8_t* sp = src_ptr_major;
+  while (src_len--) {
+    uint8_t sv = *sp++;
+    *dp++ = sv;
+    *dp++ = sv;
+  }
+  return dst_ptr;
+}
+
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  uint8_t* dp = dst_ptr;
+  const uint8_t* sp = src_ptr_major;
+  while (src_len--) {
+    uint8_t sv = *sp++;
+    *dp++ = sv;
+    *dp++ = sv;
+    *dp++ = sv;
+  }
+  return dst_ptr;
+}
+
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  uint8_t* dp = dst_ptr;
+  const uint8_t* sp = src_ptr_major;
+  while (src_len--) {
+    uint8_t sv = *sp++;
+    *dp++ = sv;
+    *dp++ = sv;
+    *dp++ = sv;
+    *dp++ = sv;
+  }
+  return dst_ptr;
+}
+
+// wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs is indexed by
+// (inv_h - 1) and then (inv_v - 1).
+static const wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs[4][4] = {
+        {
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+        },
+        {
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+        },
+        {
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+        },
+        {
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+        },
+};
+
+// wuffs_base__pixel_swizzler__has_triangle_upsampler reports whether the
+// (inv_h, inv_v) pair is (1, 2), (2, 1) or (2, 2), returning a true bool.
+static inline bool  //
+wuffs_base__pixel_swizzler__has_triangle_upsampler(uint32_t inv_h,
+                                                   uint32_t inv_v) {
+  if (inv_h == 1u) {
+    return inv_v == 2u;
+  }
+  return (inv_h == 2u) && ((inv_v == 1u) || (inv_v == 2u));
+}
+
+// --------
+
+// All of the wuffs_base__pixel_swizzler__swizzle_ycc__etc functions have
+// preconditions. See all of the checks made in
+// wuffs_base__pixel_swizzler__swizzle_ycck before calling these functions. For
 // example, (width > 0) is a precondition, but there are many more.
+
 static void  //
-wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
-    const wuffs_base__pixel_swizzler* p,
+wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
     wuffs_base__pixel_buffer* dst,
-    wuffs_base__slice_u8 dst_palette,
+    uint32_t width,
+    uint32_t y,
+    const uint8_t* src_ptr0,
+    const uint8_t* src_ptr1,
+    const uint8_t* src_ptr2,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t inv_h0,
+    uint32_t inv_h1,
+    uint32_t inv_h2,
+    uint32_t inv_v0,
+    uint32_t inv_v1,
+    uint32_t inv_v2,
+    uint32_t half_width_for_2to1,
+    uint32_t h1v2_bias,
+    uint8_t* scratch_buffer_2k_ptr,
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc0,
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc1,
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc2) {
+  const uint8_t* src0 = src_ptr0 + ((y / inv_v0) * (size_t)stride0);
+  const uint8_t* src1 = src_ptr1 + ((y / inv_v1) * (size_t)stride1);
+  const uint8_t* src2 = src_ptr2 + ((y / inv_v2) * (size_t)stride2);
+  uint32_t total_src_len0 = 0u;
+  uint32_t total_src_len1 = 0u;
+  uint32_t total_src_len2 = 0u;
+
+  uint32_t x = 0u;
+  while (x < width) {
+    bool first_column = x == 0u;
+    uint32_t end = x + 672u;
+    if (end > width) {
+      end = width;
+    }
+
+    uint32_t src_len0 = ((end - x) + inv_h0 - 1u) / inv_h0;
+    uint32_t src_len1 = ((end - x) + inv_h1 - 1u) / inv_h1;
+    uint32_t src_len2 = ((end - x) + inv_h2 - 1u) / inv_h2;
+    total_src_len0 += src_len0;
+    total_src_len1 += src_len1;
+    total_src_len2 += src_len2;
+
+    const uint8_t* src_ptr_x0 = src0 + (x / inv_h0);
+    const uint8_t* up0 = (*upfunc0)(          //
+        scratch_buffer_2k_ptr + (0u * 672u),  //
+        src_ptr_x0,                           //
+        src_ptr_x0,                           //
+        src_len0,                             //
+        h1v2_bias,                            //
+        first_column,                         //
+        (total_src_len0 >= half_width_for_2to1));
+
+    const uint8_t* src_ptr_x1 = src1 + (x / inv_h1);
+    const uint8_t* up1 = (*upfunc1)(          //
+        scratch_buffer_2k_ptr + (1u * 672u),  //
+        src_ptr_x1,                           //
+        src_ptr_x1,                           //
+        src_len1,                             //
+        h1v2_bias,                            //
+        first_column,                         //
+        (total_src_len1 >= half_width_for_2to1));
+
+    const uint8_t* src_ptr_x2 = src2 + (x / inv_h2);
+    const uint8_t* up2 = (*upfunc2)(          //
+        scratch_buffer_2k_ptr + (2u * 672u),  //
+        src_ptr_x2,                           //
+        src_ptr_x2,                           //
+        src_len2,                             //
+        h1v2_bias,                            //
+        first_column,                         //
+        (total_src_len2 >= half_width_for_2to1));
+
+    for (; x < end; x++) {
+      wuffs_base__pixel_buffer__set_color_u32_at(
+          dst, x, y,
+          wuffs_base__color_ycc__as__color_u32(*up0++, *up1++, *up2++));
+    }
+  }
+}
+
+static void  //
+wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter(
+    wuffs_base__pixel_buffer* dst,
     uint32_t width,
     uint32_t height,
     const uint8_t* src_ptr0,
@@ -50,74 +277,129 @@
     uint32_t stride0,
     uint32_t stride1,
     uint32_t stride2,
-    uint32_t h0_out_of_12,
-    uint32_t h1_out_of_12,
-    uint32_t h2_out_of_12,
-    uint32_t v0_out_of_12,
-    uint32_t v1_out_of_12,
-    uint32_t v2_out_of_12) {
-  uint32_t iy0 = 0;
-  uint32_t iy1 = 0;
-  uint32_t iy2 = 0;
-  uint32_t y = 0;
+    uint32_t inv_h0,
+    uint32_t inv_h1,
+    uint32_t inv_h2,
+    uint32_t inv_v0,
+    uint32_t inv_v1,
+    uint32_t inv_v2,
+    uint32_t half_width_for_2to1,
+    uint32_t half_height_for_2to1,
+    uint8_t* scratch_buffer_2k_ptr) {
+  wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc0 =
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
+          [(inv_h0 - 1u) & 3u][(inv_v0 - 1u) & 3u];
+  wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc1 =
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
+          [(inv_h1 - 1u) & 3u][(inv_v1 - 1u) & 3u];
+  wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc2 =
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
+          [(inv_h2 - 1u) & 3u][(inv_v2 - 1u) & 3u];
+
+  uint32_t y;
+  for (y = 0u; y < height; y++) {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
+        dst, width, y,                 //
+        src_ptr0, src_ptr1, src_ptr2,  //
+        stride0, stride1, stride2,     //
+        inv_h0, inv_h1, inv_h2,        //
+        inv_v0, inv_v1, inv_v2,        //
+        half_width_for_2to1,           //
+        0u,                            //
+        scratch_buffer_2k_ptr,         //
+        upfunc0, upfunc1, upfunc2);
+  }
+}
+
+static void  //
+wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
+    wuffs_base__pixel_buffer* dst,
+    uint32_t width,
+    uint32_t height,
+    const uint8_t* src_ptr0,
+    const uint8_t* src_ptr1,
+    const uint8_t* src_ptr2,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t inv_h0,
+    uint32_t inv_h1,
+    uint32_t inv_h2,
+    uint32_t inv_v0,
+    uint32_t inv_v1,
+    uint32_t inv_v2) {
+  // Convert an inv_h or inv_v value from {1, 2, 3, 4} to {12, 6, 4, 3}.
+  uint32_t h0_out_of_12 = 12u / inv_h0;
+  uint32_t h1_out_of_12 = 12u / inv_h1;
+  uint32_t h2_out_of_12 = 12u / inv_h2;
+  uint32_t v0_out_of_12 = 12u / inv_v0;
+  uint32_t v1_out_of_12 = 12u / inv_v1;
+  uint32_t v2_out_of_12 = 12u / inv_v2;
+
+  uint32_t iy0 = 0u;
+  uint32_t iy1 = 0u;
+  uint32_t iy2 = 0u;
+  uint32_t y = 0u;
   while (true) {
     const uint8_t* src_iter0 = src_ptr0;
     const uint8_t* src_iter1 = src_ptr1;
     const uint8_t* src_iter2 = src_ptr2;
 
-    uint32_t ix0 = 0;
-    uint32_t ix1 = 0;
-    uint32_t ix2 = 0;
-    uint32_t x = 0;
+    uint32_t ix0 = 0u;
+    uint32_t ix1 = 0u;
+    uint32_t ix2 = 0u;
+    uint32_t x = 0u;
     while (true) {
       wuffs_base__pixel_buffer__set_color_u32_at(
           dst, x, y,
           wuffs_base__color_ycc__as__color_u32(*src_iter0, *src_iter1,
                                                *src_iter2));
 
-      if ((x + 1) == width) {
+      if ((x + 1u) == width) {
         break;
       }
-      x = x + 1;
+      x = x + 1u;
       ix0 += h0_out_of_12;
-      if (ix0 >= 12) {
-        ix0 = 0;
+      if (ix0 >= 12u) {
+        ix0 = 0u;
         src_iter0++;
       }
       ix1 += h1_out_of_12;
-      if (ix1 >= 12) {
-        ix1 = 0;
+      if (ix1 >= 12u) {
+        ix1 = 0u;
         src_iter1++;
       }
       ix2 += h2_out_of_12;
-      if (ix2 >= 12) {
-        ix2 = 0;
+      if (ix2 >= 12u) {
+        ix2 = 0u;
         src_iter2++;
       }
     }
 
-    if ((y + 1) == height) {
+    if ((y + 1u) == height) {
       break;
     }
-    y = y + 1;
+    y = y + 1u;
     iy0 += v0_out_of_12;
-    if (iy0 >= 12) {
-      iy0 = 0;
+    if (iy0 >= 12u) {
+      iy0 = 0u;
       src_ptr0 += stride0;
     }
     iy1 += v1_out_of_12;
-    if (iy1 >= 12) {
-      iy1 = 0;
+    if (iy1 >= 12u) {
+      iy1 = 0u;
       src_ptr1 += stride1;
     }
     iy2 += v2_out_of_12;
-    if (iy2 >= 12) {
-      iy2 = 0;
+    if (iy2 >= 12u) {
+      iy2 = 0u;
       src_ptr2 += stride2;
     }
   }
 }
 
+// --------
+
 // wuffs_base__pixel_swizzler__flattened_length is like
 // wuffs_base__table__flattened_length but returns uint64_t (not size_t) and
 // also accounts for subsampling.
@@ -127,66 +409,78 @@
                                              uint32_t stride,
                                              uint32_t inv_h,
                                              uint32_t inv_v) {
-  uint64_t scaled_width = (((uint64_t)width) + (inv_h - 1)) / inv_h;
-  uint64_t scaled_height = (((uint64_t)height) + (inv_v - 1)) / inv_v;
-  if (scaled_height <= 0) {
-    return 0;
+  uint64_t scaled_width = (((uint64_t)width) + (inv_h - 1u)) / inv_h;
+  uint64_t scaled_height = (((uint64_t)height) + (inv_v - 1u)) / inv_v;
+  if (scaled_height <= 0u) {
+    return 0u;
   }
-  return ((scaled_height - 1) * stride) + scaled_width;
+  return ((scaled_height - 1u) * stride) + scaled_width;
 }
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status  //
-wuffs_base__pixel_swizzler__swizzle_ycck(const wuffs_base__pixel_swizzler* p,
-                                         wuffs_base__pixel_buffer* dst,
-                                         wuffs_base__slice_u8 dst_palette,
-                                         uint32_t width,
-                                         uint32_t height,
-                                         wuffs_base__slice_u8 src0,
-                                         wuffs_base__slice_u8 src1,
-                                         wuffs_base__slice_u8 src2,
-                                         wuffs_base__slice_u8 src3,
-                                         uint32_t width0,
-                                         uint32_t width1,
-                                         uint32_t width2,
-                                         uint32_t width3,
-                                         uint32_t height0,
-                                         uint32_t height1,
-                                         uint32_t height2,
-                                         uint32_t height3,
-                                         uint32_t stride0,
-                                         uint32_t stride1,
-                                         uint32_t stride2,
-                                         uint32_t stride3,
-                                         uint8_t h0,
-                                         uint8_t h1,
-                                         uint8_t h2,
-                                         uint8_t h3,
-                                         uint8_t v0,
-                                         uint8_t v1,
-                                         uint8_t v2,
-                                         uint8_t v3,
-                                         bool triangle_filter_for_2to1) {
+wuffs_base__pixel_swizzler__swizzle_ycck(
+    const wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_buffer* dst,
+    wuffs_base__slice_u8 dst_palette,
+    uint32_t width,
+    uint32_t height,
+    wuffs_base__slice_u8 src0,
+    wuffs_base__slice_u8 src1,
+    wuffs_base__slice_u8 src2,
+    wuffs_base__slice_u8 src3,
+    uint32_t width0,
+    uint32_t width1,
+    uint32_t width2,
+    uint32_t width3,
+    uint32_t height0,
+    uint32_t height1,
+    uint32_t height2,
+    uint32_t height3,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t stride3,
+    uint8_t h0,
+    uint8_t h1,
+    uint8_t h2,
+    uint8_t h3,
+    uint8_t v0,
+    uint8_t v1,
+    uint8_t v2,
+    uint8_t v3,
+    bool triangle_filter_for_2to1,
+    wuffs_base__slice_u8 scratch_buffer_2k) {
   if (!p) {
     return wuffs_base__make_status(wuffs_base__error__bad_receiver);
-  } else if ((h3 != 0) || (v3 != 0) || triangle_filter_for_2to1) {
-    // TODO: support the K in YCCK and support triangle_filter_for_2to1.
+  } else if ((h3 != 0u) || (v3 != 0u)) {
+    // TODO: support the K in YCCK.
     return wuffs_base__make_status(
         wuffs_base__error__unsupported_pixel_swizzler_option);
-  } else if (!dst || (width > 0xFFFF) || (height > 0xFFFF) ||  //
-             (4 <= (h0 - 1)) || (4 <= (v0 - 1)) ||             //
-             (4 <= (h1 - 1)) || (4 <= (v1 - 1)) ||             //
-             (4 <= (h2 - 1)) || (4 <= (v2 - 1))) {
+  } else if (!dst || (width > 0xFFFFu) || (height > 0xFFFFu) ||  //
+             (4u <= (h0 - 1u)) || (4u <= (v0 - 1u)) ||           //
+             (4u <= (h1 - 1u)) || (4u <= (v1 - 1u)) ||           //
+             (4u <= (h2 - 1u)) || (4u <= (v2 - 1u)) ||           //
+             (scratch_buffer_2k.len < 2048u)) {
     return wuffs_base__make_status(wuffs_base__error__bad_argument);
   }
 
   uint32_t max_incl_h = wuffs_base__u32__max_of_4(h0, h1, h2, h3);
   uint32_t max_incl_v = wuffs_base__u32__max_of_4(v0, v1, v2, v3);
+
+  // Calculate the inverse h and v ratios.
+  //
+  // It also canonicalizes (h=2 and max_incl_h=4) as equivalent to (h=1 and
+  // max_incl_h=2). In both cases, the inv_h value is 2.
   uint32_t inv_h0 = max_incl_h / h0;
   uint32_t inv_h1 = max_incl_h / h1;
   uint32_t inv_h2 = max_incl_h / h2;
   uint32_t inv_v0 = max_incl_v / v0;
   uint32_t inv_v1 = max_incl_v / v1;
   uint32_t inv_v2 = max_incl_v / v2;
+
+  uint32_t half_width_for_2to1 = (width + 1u) / 2u;
+  uint32_t half_height_for_2to1 = (height + 1u) / 2u;
+
   width = wuffs_base__u32__min_of_5(  //
       width,                          //
       width0 * inv_h0,                //
@@ -246,19 +540,31 @@
           wuffs_base__error__unsupported_pixel_swizzler_option);
   }
 
-  if ((width <= 0) || (height <= 0)) {
+  if ((width <= 0u) || (height <= 0u)) {
     return wuffs_base__make_status(NULL);
   }
 
-  wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
-      p, dst, dst_palette, width, height,  //
-      src0.ptr, src1.ptr, src2.ptr,        //
-      stride0, stride1, stride2,           //
-      (h0 * 12) / max_incl_h,              //
-      (h1 * 12) / max_incl_h,              //
-      (h2 * 12) / max_incl_h,              //
-      (v0 * 12) / max_incl_v,              //
-      (v1 * 12) / max_incl_v,              //
-      (v2 * 12) / max_incl_v);
+  if (triangle_filter_for_2to1 &&
+      (wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h0, inv_v0) ||
+       wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h1, inv_v1) ||
+       wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h2, inv_v2))) {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter(
+        dst, width, height,                         //
+        src0.ptr, src1.ptr, src2.ptr,               //
+        stride0, stride1, stride2,                  //
+        inv_h0, inv_h1, inv_h2,                     //
+        inv_v0, inv_v1, inv_v2,                     //
+        half_width_for_2to1, half_height_for_2to1,  //
+        scratch_buffer_2k.ptr);
+
+  } else {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
+        dst, width, height,            //
+        src0.ptr, src1.ptr, src2.ptr,  //
+        stride0, stride1, stride2,     //
+        inv_h0, inv_h1, inv_h2,        //
+        inv_v0, inv_v1, inv_v2);
+  }
+
   return wuffs_base__make_status(NULL);
 }
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index d0c3215..fceef8a 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -672,7 +672,8 @@
 		"v1: u8[..= 4]," +
 		"v2: u8[..= 4]," +
 		"v3: u8[..= 4]," +
-		"triangle_filter_for_2to1: bool) status",
+		"triangle_filter_for_2to1: bool," +
+		"scratch_buffer_2k: slice u8) status",
 
 	// ---- arm_crc32_utility
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index e667f57..af37bd6 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -12733,36 +12733,38 @@
     uint64_t num_pixels);
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status  //
-wuffs_base__pixel_swizzler__swizzle_ycck(const wuffs_base__pixel_swizzler* p,
-                                         wuffs_base__pixel_buffer* dst,
-                                         wuffs_base__slice_u8 dst_palette,
-                                         uint32_t width,
-                                         uint32_t height,
-                                         wuffs_base__slice_u8 src0,
-                                         wuffs_base__slice_u8 src1,
-                                         wuffs_base__slice_u8 src2,
-                                         wuffs_base__slice_u8 src3,
-                                         uint32_t width0,
-                                         uint32_t width1,
-                                         uint32_t width2,
-                                         uint32_t width3,
-                                         uint32_t height0,
-                                         uint32_t height1,
-                                         uint32_t height2,
-                                         uint32_t height3,
-                                         uint32_t stride0,
-                                         uint32_t stride1,
-                                         uint32_t stride2,
-                                         uint32_t stride3,
-                                         uint8_t h0,
-                                         uint8_t h1,
-                                         uint8_t h2,
-                                         uint8_t h3,
-                                         uint8_t v0,
-                                         uint8_t v1,
-                                         uint8_t v2,
-                                         uint8_t v3,
-                                         bool triangle_filter_for_2to1);
+wuffs_base__pixel_swizzler__swizzle_ycck(
+    const wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_buffer* dst,
+    wuffs_base__slice_u8 dst_palette,
+    uint32_t width,
+    uint32_t height,
+    wuffs_base__slice_u8 src0,
+    wuffs_base__slice_u8 src1,
+    wuffs_base__slice_u8 src2,
+    wuffs_base__slice_u8 src3,
+    uint32_t width0,
+    uint32_t width1,
+    uint32_t width2,
+    uint32_t width3,
+    uint32_t height0,
+    uint32_t height1,
+    uint32_t height2,
+    uint32_t height3,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t stride3,
+    uint8_t h0,
+    uint8_t h1,
+    uint8_t h2,
+    uint8_t h3,
+    uint8_t v0,
+    uint8_t v1,
+    uint8_t v2,
+    uint8_t v3,
+    bool triangle_filter_for_2to1,
+    wuffs_base__slice_u8 scratch_buffer_2k);
 
 // ---------------- Images (Utility)
 
@@ -22680,14 +22682,241 @@
       e);
 }
 
-// Preconditions: see all the checks made in
-// wuffs_base__pixel_swizzler__swizzle_ycck before calling this function. For
+// --------
+
+// wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upsamples into a
+// destination slice at least 480 (YCCK) or 672 (YCC) bytes long. Its src_len
+// (multiplied by 1, 2, 3 or 4) is positive but no more than that length. The
+// 480 or 672 length is just under 1/4 or 1/3 of the scratch_buffer_2k slice
+// length. Both (480 * 4) = 1920 and (672 * 3) = 2016 are less than 2048.
+//
+// 480 and 672 are nice round numbers because a JPEG MCU is 1, 2, 3 or 4 blocks
+// wide and each block is 8 pixels wide. We have:
+//   480 = 1 * 8 * 60,   672 = 1 * 8 * 84
+//   480 = 2 * 8 * 30,   672 = 2 * 8 * 42
+//   480 = 3 * 8 * 20,   672 = 3 * 8 * 28
+//   480 = 4 * 8 * 15,   672 = 4 * 8 * 21
+//
+// Box filters are equivalent to nearest neighbor upsampling. These ignore the
+// src_ptr_minor, h1v2_bias, first_column and last_column arguments.
+//
+// TODO: triangle filters.
+typedef const uint8_t* (  // Returns a pointer to the upsampled bytes.
+    *wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func)(
+    uint8_t* dst_ptr,              // Where upsampled bytes may be written.
+    const uint8_t* src_ptr_major,  // Nearest row.
+    const uint8_t* src_ptr_minor,  // Adjacent row, alternating above or below.
+    size_t src_len,                // Number of source bytes to upsample.
+    uint32_t h1v2_bias,
+    bool first_column,
+    bool last_column);
+
+static const uint8_t*  // Box filter for inv_h == 1: no upsampling is needed.
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box(
+    uint8_t* dst_ptr,  // Not written to.
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,  // Not read.
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  return src_ptr_major;  // The source row is already the result.
+}
+
+static const uint8_t*  // Box filter for inv_h == 2: doubles each source byte.
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box(
+    uint8_t* dst_ptr,  // Receives (2 * src_len) bytes.
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  uint8_t* dp = dst_ptr;
+  const uint8_t* sp = src_ptr_major;
+  while (src_len--) {  // One source byte in, two identical bytes out.
+    uint8_t sv = *sp++;
+    *dp++ = sv;
+    *dp++ = sv;
+  }
+  return dst_ptr;  // The start of the bytes just written.
+}
+
+static const uint8_t*  // Box filter for inv_h == 3: triples each source byte.
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box(
+    uint8_t* dst_ptr,  // Receives (3 * src_len) bytes.
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  uint8_t* dp = dst_ptr;
+  const uint8_t* sp = src_ptr_major;
+  while (src_len--) {  // One source byte in, three identical bytes out.
+    uint8_t sv = *sp++;
+    *dp++ = sv;
+    *dp++ = sv;
+    *dp++ = sv;
+  }
+  return dst_ptr;  // The start of the bytes just written.
+}
+
+static const uint8_t*  // Box filter for inv_h == 4: repeats each byte 4 times.
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box(
+    uint8_t* dst_ptr,  // Receives (4 * src_len) bytes.
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor_ignored,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column_ignored,
+    bool last_column_ignored) {
+  uint8_t* dp = dst_ptr;
+  const uint8_t* sp = src_ptr_major;
+  while (src_len--) {  // One source byte in, four identical bytes out.
+    uint8_t sv = *sp++;
+    *dp++ = sv;
+    *dp++ = sv;
+    *dp++ = sv;
+    *dp++ = sv;
+  }
+  return dst_ptr;  // The start of the bytes just written.
+}
+
+// wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs is indexed by
+// (inv_h - 1) and then (inv_v - 1).
+static const wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs[4][4] = {
+        {  // inv_h == 1, for inv_v in 1 ..= 4.
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+        },
+        {  // inv_h == 2, for inv_v in 1 ..= 4.
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+        },
+        {  // inv_h == 3, for inv_v in 1 ..= 4.
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h3vn_box,
+        },
+        {  // inv_h == 4, for inv_v in 1 ..= 4.
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h4vn_box,
+        },
+};
+
+static inline uint32_t  // Non-zero iff (inv_h, inv_v) is h1v2, h2v1 or h2v2.
+wuffs_base__pixel_swizzler__has_triangle_upsampler(uint32_t inv_h,
+                                                   uint32_t inv_v) {
+  if (inv_h == 1u) {
+    return inv_v == 2u;  // h1v2.
+  } else if (inv_h == 2u) {
+    return (inv_v == 1u) || (inv_v == 2u);  // h2v1 or h2v2.
+  }
+  return false;  // NOTE(review): all returns are boolean; consider bool type.
+}
+
+// --------
+
+// All of the wuffs_base__pixel_swizzler__swizzle_ycc__etc functions have
+// preconditions. See all of the checks made in
+// wuffs_base__pixel_swizzler__swizzle_ycck before calling these functions. For
 // example, (width > 0) is a precondition, but there are many more.
+
 static void  //
-wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
-    const wuffs_base__pixel_swizzler* p,
+wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
     wuffs_base__pixel_buffer* dst,
-    wuffs_base__slice_u8 dst_palette,
+    uint32_t width,
+    uint32_t y,  // The destination row to fill.
+    const uint8_t* src_ptr0,
+    const uint8_t* src_ptr1,
+    const uint8_t* src_ptr2,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t inv_h0,
+    uint32_t inv_h1,
+    uint32_t inv_h2,
+    uint32_t inv_v0,
+    uint32_t inv_v1,
+    uint32_t inv_v2,
+    uint32_t half_width_for_2to1,  // Threshold for the last_column argument.
+    uint32_t h1v2_bias,  // Passed unchanged to each upfunc.
+    uint8_t* scratch_buffer_2k_ptr,  // Used as three 672-byte regions.
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc0,
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc1,
+    wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc2) {
+  const uint8_t* src0 = src_ptr0 + ((y / inv_v0) * (size_t)stride0);
+  const uint8_t* src1 = src_ptr1 + ((y / inv_v1) * (size_t)stride1);
+  const uint8_t* src2 = src_ptr2 + ((y / inv_v2) * (size_t)stride2);
+  uint32_t total_src_len0 = 0u;  // Running sum of src_len0 over all chunks.
+  uint32_t total_src_len1 = 0u;  // Likewise for src_len1.
+  uint32_t total_src_len2 = 0u;  // Likewise for src_len2.
+
+  uint32_t x = 0u;
+  while (x < width) {  // Each iteration converts a chunk of up to 672 pixels.
+    bool first_column = x == 0u;  // True only for the row's first chunk.
+    uint32_t end = x + 672u;
+    if (end > width) {
+      end = width;  // The row's final chunk may be shorter than 672.
+    }
+
+    uint32_t src_len0 = ((end - x) + inv_h0 - 1u) / inv_h0;  // Rounds up.
+    uint32_t src_len1 = ((end - x) + inv_h1 - 1u) / inv_h1;  // Rounds up.
+    uint32_t src_len2 = ((end - x) + inv_h2 - 1u) / inv_h2;  // Rounds up.
+    total_src_len0 += src_len0;
+    total_src_len1 += src_len1;
+    total_src_len2 += src_len2;
+
+    const uint8_t* src_ptr_x0 = src0 + (x / inv_h0);
+    const uint8_t* up0 = (*upfunc0)(          //
+        scratch_buffer_2k_ptr + (0u * 672u),  // dst_ptr.
+        src_ptr_x0,                           // src_ptr_major.
+        src_ptr_x0,                           // src_ptr_minor (same pointer).
+        src_len0,                             //
+        h1v2_bias,                            //
+        first_column,                         //
+        (total_src_len0 >= half_width_for_2to1));
+
+    const uint8_t* src_ptr_x1 = src1 + (x / inv_h1);
+    const uint8_t* up1 = (*upfunc1)(          //
+        scratch_buffer_2k_ptr + (1u * 672u),  // dst_ptr.
+        src_ptr_x1,                           // src_ptr_major.
+        src_ptr_x1,                           // src_ptr_minor (same pointer).
+        src_len1,                             //
+        h1v2_bias,                            //
+        first_column,                         //
+        (total_src_len1 >= half_width_for_2to1));
+
+    const uint8_t* src_ptr_x2 = src2 + (x / inv_h2);
+    const uint8_t* up2 = (*upfunc2)(          //
+        scratch_buffer_2k_ptr + (2u * 672u),  // dst_ptr.
+        src_ptr_x2,                           // src_ptr_major.
+        src_ptr_x2,                           // src_ptr_minor (same pointer).
+        src_len2,                             //
+        h1v2_bias,                            //
+        first_column,                         //
+        (total_src_len2 >= half_width_for_2to1));
+
+    for (; x < end; x++) {  // Leaves x at the start of the next chunk.
+      wuffs_base__pixel_buffer__set_color_u32_at(
+          dst, x, y,
+          wuffs_base__color_ycc__as__color_u32(*up0++, *up1++, *up2++));
+    }
+  }
+}
+
+static void  // Fills dst one row at a time, for each y in 0 .. height.
+wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter(
+    wuffs_base__pixel_buffer* dst,
     uint32_t width,
     uint32_t height,
     const uint8_t* src_ptr0,
@@ -22696,74 +22925,129 @@
     uint32_t stride0,
     uint32_t stride1,
     uint32_t stride2,
-    uint32_t h0_out_of_12,
-    uint32_t h1_out_of_12,
-    uint32_t h2_out_of_12,
-    uint32_t v0_out_of_12,
-    uint32_t v1_out_of_12,
-    uint32_t v2_out_of_12) {
-  uint32_t iy0 = 0;
-  uint32_t iy1 = 0;
-  uint32_t iy2 = 0;
-  uint32_t y = 0;
+    uint32_t inv_h0,
+    uint32_t inv_h1,
+    uint32_t inv_h2,
+    uint32_t inv_v0,
+    uint32_t inv_v1,
+    uint32_t inv_v2,
+    uint32_t half_width_for_2to1,
+    uint32_t half_height_for_2to1,  // Not referenced in this function body.
+    uint8_t* scratch_buffer_2k_ptr) {
+  wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc0 =
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
+          [(inv_h0 - 1u) & 3u][(inv_v0 - 1u) & 3u];  // Indexes are 0 ..= 3.
+  wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc1 =
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
+          [(inv_h1 - 1u) & 3u][(inv_v1 - 1u) & 3u];  // Indexes are 0 ..= 3.
+  wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func upfunc2 =
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
+          [(inv_h2 - 1u) & 3u][(inv_v2 - 1u) & 3u];  // Indexes are 0 ..= 3.
+
+  uint32_t y;
+  for (y = 0u; y < height; y++) {  // The upfuncs are chosen once, above.
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
+        dst, width, y,                 //
+        src_ptr0, src_ptr1, src_ptr2,  //
+        stride0, stride1, stride2,     //
+        inv_h0, inv_h1, inv_h2,        //
+        inv_v0, inv_v1, inv_v2,        //
+        half_width_for_2to1,           //
+        0u,                            // h1v2_bias.
+        scratch_buffer_2k_ptr,         //
+        upfunc0, upfunc1, upfunc2);
+  }
+}
+
+static void  // Converts YCC to dst pixel by pixel, repeating source samples.
+wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
+    wuffs_base__pixel_buffer* dst,
+    uint32_t width,
+    uint32_t height,
+    const uint8_t* src_ptr0,
+    const uint8_t* src_ptr1,
+    const uint8_t* src_ptr2,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t inv_h0,
+    uint32_t inv_h1,
+    uint32_t inv_h2,
+    uint32_t inv_v0,
+    uint32_t inv_v1,
+    uint32_t inv_v2) {
+  // Convert an inv_h or inv_v value from {1, 2, 3, 4} to {12, 6, 4, 3}.
+  uint32_t h0_out_of_12 = 12u / inv_h0;
+  uint32_t h1_out_of_12 = 12u / inv_h1;
+  uint32_t h2_out_of_12 = 12u / inv_h2;
+  uint32_t v0_out_of_12 = 12u / inv_v0;
+  uint32_t v1_out_of_12 = 12u / inv_v1;
+  uint32_t v2_out_of_12 = 12u / inv_v2;
+
+  uint32_t iy0 = 0u;  // Twelfths of a source row consumed, for plane 0.
+  uint32_t iy1 = 0u;  // Likewise for plane 1.
+  uint32_t iy2 = 0u;  // Likewise for plane 2.
+  uint32_t y = 0u;
   while (true) {
     const uint8_t* src_iter0 = src_ptr0;
     const uint8_t* src_iter1 = src_ptr1;
     const uint8_t* src_iter2 = src_ptr2;
 
-    uint32_t ix0 = 0;
-    uint32_t ix1 = 0;
-    uint32_t ix2 = 0;
-    uint32_t x = 0;
+    uint32_t ix0 = 0u;  // Twelfths of a source sample consumed, for plane 0.
+    uint32_t ix1 = 0u;  // Likewise for plane 1.
+    uint32_t ix2 = 0u;  // Likewise for plane 2.
+    uint32_t x = 0u;
     while (true) {
       wuffs_base__pixel_buffer__set_color_u32_at(
           dst, x, y,
           wuffs_base__color_ycc__as__color_u32(*src_iter0, *src_iter1,
                                                *src_iter2));
 
-      if ((x + 1) == width) {
+      if ((x + 1u) == width) {  // The last column of this row is done.
         break;
       }
-      x = x + 1;
+      x = x + 1u;
       ix0 += h0_out_of_12;
-      if (ix0 >= 12) {
-        ix0 = 0;
+      if (ix0 >= 12u) {  // Step plane 0 to its next source sample.
+        ix0 = 0u;
         src_iter0++;
       }
       ix1 += h1_out_of_12;
-      if (ix1 >= 12) {
-        ix1 = 0;
+      if (ix1 >= 12u) {  // Step plane 1 to its next source sample.
+        ix1 = 0u;
         src_iter1++;
       }
       ix2 += h2_out_of_12;
-      if (ix2 >= 12) {
-        ix2 = 0;
+      if (ix2 >= 12u) {  // Step plane 2 to its next source sample.
+        ix2 = 0u;
         src_iter2++;
       }
     }
 
-    if ((y + 1) == height) {
+    if ((y + 1u) == height) {  // The last row is done.
       break;
     }
-    y = y + 1;
+    y = y + 1u;
     iy0 += v0_out_of_12;
-    if (iy0 >= 12) {
-      iy0 = 0;
+    if (iy0 >= 12u) {  // Step plane 0 to its next source row.
+      iy0 = 0u;
       src_ptr0 += stride0;
     }
     iy1 += v1_out_of_12;
-    if (iy1 >= 12) {
-      iy1 = 0;
+    if (iy1 >= 12u) {  // Step plane 1 to its next source row.
+      iy1 = 0u;
       src_ptr1 += stride1;
     }
     iy2 += v2_out_of_12;
-    if (iy2 >= 12) {
-      iy2 = 0;
+    if (iy2 >= 12u) {  // Step plane 2 to its next source row.
+      iy2 = 0u;
       src_ptr2 += stride2;
     }
   }
 }
 
+// --------
+
 // wuffs_base__pixel_swizzler__flattened_length is like
 // wuffs_base__table__flattened_length but returns uint64_t (not size_t) and
 // also accounts for subsampling.
@@ -22773,66 +23057,78 @@
                                              uint32_t stride,
                                              uint32_t inv_h,
                                              uint32_t inv_v) {
-  uint64_t scaled_width = (((uint64_t)width) + (inv_h - 1)) / inv_h;
-  uint64_t scaled_height = (((uint64_t)height) + (inv_v - 1)) / inv_v;
-  if (scaled_height <= 0) {
-    return 0;
+  uint64_t scaled_width = (((uint64_t)width) + (inv_h - 1u)) / inv_h;
+  uint64_t scaled_height = (((uint64_t)height) + (inv_v - 1u)) / inv_v;
+  if (scaled_height <= 0u) {
+    return 0u;
   }
-  return ((scaled_height - 1) * stride) + scaled_width;
+  return ((scaled_height - 1u) * stride) + scaled_width;
 }
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status  //
-wuffs_base__pixel_swizzler__swizzle_ycck(const wuffs_base__pixel_swizzler* p,
-                                         wuffs_base__pixel_buffer* dst,
-                                         wuffs_base__slice_u8 dst_palette,
-                                         uint32_t width,
-                                         uint32_t height,
-                                         wuffs_base__slice_u8 src0,
-                                         wuffs_base__slice_u8 src1,
-                                         wuffs_base__slice_u8 src2,
-                                         wuffs_base__slice_u8 src3,
-                                         uint32_t width0,
-                                         uint32_t width1,
-                                         uint32_t width2,
-                                         uint32_t width3,
-                                         uint32_t height0,
-                                         uint32_t height1,
-                                         uint32_t height2,
-                                         uint32_t height3,
-                                         uint32_t stride0,
-                                         uint32_t stride1,
-                                         uint32_t stride2,
-                                         uint32_t stride3,
-                                         uint8_t h0,
-                                         uint8_t h1,
-                                         uint8_t h2,
-                                         uint8_t h3,
-                                         uint8_t v0,
-                                         uint8_t v1,
-                                         uint8_t v2,
-                                         uint8_t v3,
-                                         bool triangle_filter_for_2to1) {
+wuffs_base__pixel_swizzler__swizzle_ycck(
+    const wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_buffer* dst,
+    wuffs_base__slice_u8 dst_palette,
+    uint32_t width,
+    uint32_t height,
+    wuffs_base__slice_u8 src0,
+    wuffs_base__slice_u8 src1,
+    wuffs_base__slice_u8 src2,
+    wuffs_base__slice_u8 src3,
+    uint32_t width0,
+    uint32_t width1,
+    uint32_t width2,
+    uint32_t width3,
+    uint32_t height0,
+    uint32_t height1,
+    uint32_t height2,
+    uint32_t height3,
+    uint32_t stride0,
+    uint32_t stride1,
+    uint32_t stride2,
+    uint32_t stride3,
+    uint8_t h0,
+    uint8_t h1,
+    uint8_t h2,
+    uint8_t h3,
+    uint8_t v0,
+    uint8_t v1,
+    uint8_t v2,
+    uint8_t v3,
+    bool triangle_filter_for_2to1,
+    wuffs_base__slice_u8 scratch_buffer_2k) {
   if (!p) {
     return wuffs_base__make_status(wuffs_base__error__bad_receiver);
-  } else if ((h3 != 0) || (v3 != 0) || triangle_filter_for_2to1) {
-    // TODO: support the K in YCCK and support triangle_filter_for_2to1.
+  } else if ((h3 != 0u) || (v3 != 0u)) {
+    // TODO: support the K in YCCK.
     return wuffs_base__make_status(
         wuffs_base__error__unsupported_pixel_swizzler_option);
-  } else if (!dst || (width > 0xFFFF) || (height > 0xFFFF) ||  //
-             (4 <= (h0 - 1)) || (4 <= (v0 - 1)) ||             //
-             (4 <= (h1 - 1)) || (4 <= (v1 - 1)) ||             //
-             (4 <= (h2 - 1)) || (4 <= (v2 - 1))) {
+  } else if (!dst || (width > 0xFFFFu) || (height > 0xFFFFu) ||  //
+             (4u <= (h0 - 1u)) || (4u <= (v0 - 1u)) ||           //
+             (4u <= (h1 - 1u)) || (4u <= (v1 - 1u)) ||           //
+             (4u <= (h2 - 1u)) || (4u <= (v2 - 1u)) ||           //
+             (scratch_buffer_2k.len < 2048u)) {
     return wuffs_base__make_status(wuffs_base__error__bad_argument);
   }
 
   uint32_t max_incl_h = wuffs_base__u32__max_of_4(h0, h1, h2, h3);
   uint32_t max_incl_v = wuffs_base__u32__max_of_4(v0, v1, v2, v3);
+
+  // Calculate the inverse h and v ratios.
+  //
+  // This division also canonicalizes (h=2 and max_incl_h=4) as equivalent to
+  // (h=1 and max_incl_h=2). In both cases, the inv_h value is 2.
   uint32_t inv_h0 = max_incl_h / h0;
   uint32_t inv_h1 = max_incl_h / h1;
   uint32_t inv_h2 = max_incl_h / h2;
   uint32_t inv_v0 = max_incl_v / v0;
   uint32_t inv_v1 = max_incl_v / v1;
   uint32_t inv_v2 = max_incl_v / v2;
+
+  uint32_t half_width_for_2to1 = (width + 1u) / 2u;
+  uint32_t half_height_for_2to1 = (height + 1u) / 2u;
+
   width = wuffs_base__u32__min_of_5(  //
       width,                          //
       width0 * inv_h0,                //
@@ -22892,20 +23188,32 @@
           wuffs_base__error__unsupported_pixel_swizzler_option);
   }
 
-  if ((width <= 0) || (height <= 0)) {
+  if ((width <= 0u) || (height <= 0u)) {
     return wuffs_base__make_status(NULL);
   }
 
-  wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
-      p, dst, dst_palette, width, height,  //
-      src0.ptr, src1.ptr, src2.ptr,        //
-      stride0, stride1, stride2,           //
-      (h0 * 12) / max_incl_h,              //
-      (h1 * 12) / max_incl_h,              //
-      (h2 * 12) / max_incl_h,              //
-      (v0 * 12) / max_incl_v,              //
-      (v1 * 12) / max_incl_v,              //
-      (v2 * 12) / max_incl_v);
+  if (triangle_filter_for_2to1 &&
+      (wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h0, inv_v0) ||
+       wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h1, inv_v1) ||
+       wuffs_base__pixel_swizzler__has_triangle_upsampler(inv_h2, inv_v2))) {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter(
+        dst, width, height,                         //
+        src0.ptr, src1.ptr, src2.ptr,               //
+        stride0, stride1, stride2,                  //
+        inv_h0, inv_h1, inv_h2,                     //
+        inv_v0, inv_v1, inv_v2,                     //
+        half_width_for_2to1, half_height_for_2to1,  //
+        scratch_buffer_2k.ptr);
+
+  } else {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__box_filter(
+        dst, width, height,            //
+        src0.ptr, src1.ptr, src2.ptr,  //
+        stride0, stride1, stride2,     //
+        inv_h0, inv_h1, inv_h2,        //
+        inv_v0, inv_v1, inv_v2);
+  }
+
   return wuffs_base__make_status(NULL);
 }
 
@@ -37757,6 +38065,7 @@
   bool v_has_h3 = false;
   bool v_has_v24 = false;
   bool v_has_v3 = false;
+  uint32_t v_upper_bound = 0;
 
   const uint8_t* iop_a_src = NULL;
   const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
@@ -37988,28 +38297,15 @@
     } else {
       self->private_impl.f_height_in_mcus = ((self->private_impl.f_height + 31) / 32);
     }
-    v_i = 0;
-    while (v_i < self->private_impl.f_num_components) {
-      if (self->private_impl.f_components_h[v_i] == 1) {
-        self->private_impl.f_components_workbuf_widths[v_i] = (((self->private_impl.f_width + 7) / 8) * 8);
-      } else if (self->private_impl.f_components_h[v_i] == 2) {
-        self->private_impl.f_components_workbuf_widths[v_i] = (((self->private_impl.f_width + 15) / 16) * 16);
-      } else if (self->private_impl.f_components_h[v_i] == 3) {
-        self->private_impl.f_components_workbuf_widths[v_i] = (((self->private_impl.f_width + 23) / 24) * 24);
-      } else {
-        self->private_impl.f_components_workbuf_widths[v_i] = (((self->private_impl.f_width + 31) / 32) * 32);
-      }
-      if (self->private_impl.f_components_v[v_i] == 1) {
-        self->private_impl.f_components_workbuf_heights[v_i] = (((self->private_impl.f_height + 7) / 8) * 8);
-      } else if (self->private_impl.f_components_v[v_i] == 2) {
-        self->private_impl.f_components_workbuf_heights[v_i] = (((self->private_impl.f_height + 15) / 16) * 16);
-      } else if (self->private_impl.f_components_v[v_i] == 3) {
-        self->private_impl.f_components_workbuf_heights[v_i] = (((self->private_impl.f_height + 23) / 24) * 24);
-      } else {
-        self->private_impl.f_components_workbuf_heights[v_i] = (((self->private_impl.f_height + 31) / 32) * 32);
-      }
-      v_i += 1;
-    }
+    v_upper_bound = 65544;
+    self->private_impl.f_components_workbuf_widths[0] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_width_in_mcus * ((uint32_t)(self->private_impl.f_components_h[0]))));
+    self->private_impl.f_components_workbuf_widths[1] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_width_in_mcus * ((uint32_t)(self->private_impl.f_components_h[1]))));
+    self->private_impl.f_components_workbuf_widths[2] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_width_in_mcus * ((uint32_t)(self->private_impl.f_components_h[2]))));
+    self->private_impl.f_components_workbuf_widths[3] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_width_in_mcus * ((uint32_t)(self->private_impl.f_components_h[3]))));
+    self->private_impl.f_components_workbuf_heights[0] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[0]))));
+    self->private_impl.f_components_workbuf_heights[1] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[1]))));
+    self->private_impl.f_components_workbuf_heights[2] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[2]))));
+    self->private_impl.f_components_workbuf_heights[3] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[3]))));
     self->private_impl.f_components_workbuf_offsets[0] = 0;
     self->private_impl.f_components_workbuf_offsets[1] = (self->private_impl.f_components_workbuf_offsets[0] + (((uint64_t)(self->private_impl.f_components_workbuf_widths[0])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[0]))));
     self->private_impl.f_components_workbuf_offsets[2] = (self->private_impl.f_components_workbuf_offsets[1] + (((uint64_t)(self->private_impl.f_components_workbuf_widths[1])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[1]))));
@@ -39419,7 +39715,8 @@
       self->private_impl.f_components_v[1],
       self->private_impl.f_components_v[2],
       self->private_impl.f_components_v[3],
-      false);
+      true,
+      wuffs_base__make_slice_u8(self->private_data.f_bitstream_buffer, 2048));
   return wuffs_base__status__ensure_not_a_suspension(v_status);
 }
 
diff --git a/std/jpeg/decode_jpeg.wuffs b/std/jpeg/decode_jpeg.wuffs
index 65a4461..c3bcaae 100644
--- a/std/jpeg/decode_jpeg.wuffs
+++ b/std/jpeg/decode_jpeg.wuffs
@@ -366,16 +366,19 @@
 }
 
 pri func decoder.decode_sof?(src: base.io_reader) {
-    var c       : base.u8
-    var comp_h  : base.u8
-    var comp_v  : base.u8
-    var i       : base.u32
-    var j       : base.u32
+    var c      : base.u8
+    var comp_h : base.u8
+    var comp_v : base.u8
+    var i      : base.u32
+    var j      : base.u32
+
     var has_h24 : base.bool
     var has_h3  : base.bool
     var has_v24 : base.bool
     var has_v3  : base.bool
 
+    var upper_bound : base.u32[..= 0x1_0008]
+
     if this.payload_length < 6 {
         return "#bad SOF marker"
     }
@@ -498,32 +501,25 @@
         this.height_in_mcus = (this.height + 0x1F) / 0x20
     }
 
-    i = 0
-    while i < this.num_components {
-        assert i < 4 via "a < b: a < c; c <= b"(c: this.num_components)
+    upper_bound = 0x1_0008
 
-        if this.components_h[i] == 1 {
-            this.components_workbuf_widths[i] = ((this.width + 0x07) / 0x08) * 0x08
-        } else if this.components_h[i] == 2 {
-            this.components_workbuf_widths[i] = ((this.width + 0x0F) / 0x10) * 0x10
-        } else if this.components_h[i] == 3 {
-            this.components_workbuf_widths[i] = ((this.width + 0x17) / 0x18) * 0x18
-        } else {
-            this.components_workbuf_widths[i] = ((this.width + 0x1F) / 0x20) * 0x20
-        }
+    this.components_workbuf_widths[0] = upper_bound.min(no_more_than:
+            8 * this.width_in_mcus * (this.components_h[0] as base.u32))
+    this.components_workbuf_widths[1] = upper_bound.min(no_more_than:
+            8 * this.width_in_mcus * (this.components_h[1] as base.u32))
+    this.components_workbuf_widths[2] = upper_bound.min(no_more_than:
+            8 * this.width_in_mcus * (this.components_h[2] as base.u32))
+    this.components_workbuf_widths[3] = upper_bound.min(no_more_than:
+            8 * this.width_in_mcus * (this.components_h[3] as base.u32))
 
-        if this.components_v[i] == 1 {
-            this.components_workbuf_heights[i] = ((this.height + 0x07) / 0x08) * 0x08
-        } else if this.components_v[i] == 2 {
-            this.components_workbuf_heights[i] = ((this.height + 0x0F) / 0x10) * 0x10
-        } else if this.components_v[i] == 3 {
-            this.components_workbuf_heights[i] = ((this.height + 0x17) / 0x18) * 0x18
-        } else {
-            this.components_workbuf_heights[i] = ((this.height + 0x1F) / 0x20) * 0x20
-        }
-
-        i += 1
-    } endwhile
+    this.components_workbuf_heights[0] = upper_bound.min(no_more_than:
+            8 * this.height_in_mcus * (this.components_v[0] as base.u32))
+    this.components_workbuf_heights[1] = upper_bound.min(no_more_than:
+            8 * this.height_in_mcus * (this.components_v[1] as base.u32))
+    this.components_workbuf_heights[2] = upper_bound.min(no_more_than:
+            8 * this.height_in_mcus * (this.components_v[2] as base.u32))
+    this.components_workbuf_heights[3] = upper_bound.min(no_more_than:
+            8 * this.height_in_mcus * (this.components_v[3] as base.u32))
 
     this.components_workbuf_offsets[0] = 0
     this.components_workbuf_offsets[1] = this.components_workbuf_offsets[0] +
@@ -1437,7 +1433,8 @@
             v1: this.components_v[1],
             v2: this.components_v[2],
             v3: this.components_v[3],
-            triangle_filter_for_2to1: false)
+            triangle_filter_for_2to1: true,
+            scratch_buffer_2k: this.bitstream_buffer[..])
     return status
 }
 
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index 640d60d..7090e03 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -512,6 +512,36 @@
   return NULL;
 }
 
+const char*  // Checks that the inv_h2 box filter writes each byte twice.
+test_wuffs_upsample_inv_h2v1() {
+  CHECK_FOCUS(__func__);
+
+  // src_array0 is "A lovely example".
+  const uint8_t src_array0[16] = {
+      0x41, 0x20, 0x6C, 0x6F, 0x76, 0x65, 0x6C, 0x79,  //
+      0x20, 0x65, 0x78, 0x61, 0x6D, 0x70, 0x6C, 0x65,  //
+  };
+
+  const uint8_t want_array[32] = {  // Each src_array0 byte, doubled.
+      0x41, 0x41, 0x20, 0x20, 0x6C, 0x6C, 0x6F, 0x6F,  //
+      0x76, 0x76, 0x65, 0x65, 0x6C, 0x6C, 0x79, 0x79,  //
+      0x20, 0x20, 0x65, 0x65, 0x78, 0x78, 0x61, 0x61,  //
+      0x6D, 0x6D, 0x70, 0x70, 0x6C, 0x6C, 0x65, 0x65,  //
+  };
+
+  const uint8_t* have_ptr =  // Upsamples all 16 source bytes.
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box(
+          g_have_array_u8, src_array0, src_array0, 16, 0, true, true);
+
+  const bool closed = true;
+  wuffs_base__io_buffer have = wuffs_base__ptr_u8__reader(  //
+      (void*)have_ptr, 32, closed);  // 32 == (2 * 16) upsampled bytes.
+  wuffs_base__io_buffer want = wuffs_base__ptr_u8__reader(  //
+      (void*)want_array, 32, closed);
+
+  return check_io_buffers_equal("", &have, &want);
+}
+
 // ---------------- WBMP Tests
 
 const char*  //
@@ -769,6 +799,7 @@
     test_wuffs_color_ycc_as_color_u32,
     test_wuffs_pixel_buffer_fill_rect,
     test_wuffs_pixel_swizzler_swizzle,
+    test_wuffs_upsample_inv_h2v1,
 
     test_wuffs_wbmp_decode_frame_config,
     test_wuffs_wbmp_decode_image_config,