std/jpeg: implement triangle-filter upsampling

"script/confirm-matches-djpeg.sh test/data/*.jpeg" is now all matches.
diff --git a/internal/cgen/base/pixconv-submodule-ycck.c b/internal/cgen/base/pixconv-submodule-ycck.c
index 004b1e2..44df25a 100644
--- a/internal/cgen/base/pixconv-submodule-ycck.c
+++ b/internal/cgen/base/pixconv-submodule-ycck.c
@@ -52,7 +52,14 @@
 // Box filters are equivalent to nearest neighbor upsampling. These ignore the
 // src_ptr_minor, h1v2_bias, first_column and last_column arguments.
 //
-// TODO: triangle filters.
+// Triangle filters blend neighboring samples in a 3:1 ratio (in 1 dimension),
+// or 9:3:3:1 (in 2 dimensions). Compared to box filters, the result is higher
+// quality (less blocky) but takes more computational effort.
+//
+// In theory, we could use triangle filters for any (inv_h, inv_v) combination.
+// In practice, matching libjpeg-turbo, we only implement them for the common
+// chroma subsampling ratios (YCC420, YCC422 or YCC440), corresponding to an
+// (inv_h, inv_v) pair of (2, 2), (2, 1) or (1, 2).
 typedef const uint8_t* (
     *wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func)(
     uint8_t* dst_ptr,
@@ -135,19 +142,159 @@
   return dst_ptr;
 }
 
+// Vertical-only triangle filter: each output sample is a 3:1 blend of the
+// major and minor row samples, plus h1v2_bias, shifted right by 2. The
+// first_column and last_column arguments are not used. Returns dst_ptr.
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1v2_triangle(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor,
+    size_t src_len,
+    uint32_t h1v2_bias,
+    bool first_column,
+    bool last_column) {
+  size_t i;
+  for (i = 0u; i < src_len; i++) {
+    uint32_t major_x3 = 3u * (uint32_t)(src_ptr_major[i]);
+    uint32_t minor_x1 = (uint32_t)(src_ptr_minor[i]);
+    dst_ptr[i] = (uint8_t)((major_x3 + minor_x1 + h1v2_bias) >> 2u);
+  }
+  return dst_ptr;
+}
+
+// Horizontal-only triangle filter: each source sample yields two outputs,
+// 3:1 blends with its left and right neighbor (rounding biases 1u and 2u).
+// At a first (or last) column, the outermost output copies the sample as is.
+// src_len is decremented before being tested: presumably it is non-zero.
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v1_triangle(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column,
+    bool last_column) {
+  uint8_t* out = dst_ptr;
+  const uint8_t* src = src_ptr_major;
+  if (first_column) {
+    uint32_t curr = src[0];
+    src_len--;
+    if ((src_len == 0u) && last_column) {
+      out[0] = (uint8_t)curr;
+      out[1] = (uint8_t)curr;
+      return dst_ptr;
+    }
+    uint32_t next = src[1];
+    out[0] = (uint8_t)curr;
+    out[1] = (uint8_t)(((3u * curr) + next + 2u) >> 2u);
+    if (src_len == 0u) {
+      return dst_ptr;
+    }
+    src += 1;
+    out += 2;
+  }
+
+  src_len -= (last_column ? 1u : 0u);
+  for (; src_len > 0u; src_len--, src += 1, out += 2) {
+    uint32_t prev = src[-1];
+    uint32_t next = src[+1];
+    uint32_t curr3 = 3u * (uint32_t)(src[0]);
+    out[0] = (uint8_t)((curr3 + prev + 1u) >> 2u);
+    out[1] = (uint8_t)((curr3 + next + 2u) >> 2u);
+  }
+
+  if (last_column) {
+    uint32_t prev = src[-1];
+    uint32_t curr = src[0];
+    out[0] = (uint8_t)(((3u * curr) + prev + 1u) >> 2u);
+    out[1] = (uint8_t)curr;
+  }
+  return dst_ptr;
+}
+
+// Two-dimensional triangle filter: each source column yields two outputs.
+// The major and minor rows are first blended 3:1 within each column. That
+// per-column blend is then mixed 3:1 with the blend of the column to its left
+// (first output, rounding bias 8u) or right (second output, rounding bias
+// 7u), giving 9:3:3:1 weights overall, and the sum is divided by 16.
+//
+// At a first (or last) column, the missing left (or right) neighbor is
+// replaced by the column itself. The bias argument is not used. src_len is
+// decremented before being tested: presumably callers pass a non-zero value.
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column,
+    bool last_column) {
+  // heavy is the row weighted by 3u; light is the row weighted by 1u.
+  const uint8_t* heavy = src_ptr_major;
+  const uint8_t* light = src_ptr_minor;
+  uint8_t* out = dst_ptr;
+
+  if (first_column) {
+    uint32_t curr = (3u * (uint32_t)(heavy[0])) + (uint32_t)(light[0]);
+    src_len--;
+    if ((src_len == 0u) && last_column) {
+      // A lone column has no neighbor on either side.
+      out[0] = (uint8_t)(((4u * curr) + 8u) >> 4u);
+      out[1] = (uint8_t)(((4u * curr) + 7u) >> 4u);
+      return dst_ptr;
+    }
+
+    // No left neighbor: the column's own blend stands in for it.
+    uint32_t next = (3u * (uint32_t)(heavy[1])) + (uint32_t)(light[1]);
+    out[0] = (uint8_t)(((4u * curr) + 8u) >> 4u);
+    out[1] = (uint8_t)(((3u * curr) + next + 7u) >> 4u);
+    if (src_len == 0u) {
+      return dst_ptr;
+    }
+    // The pointers advance by one source column, i.e. two output samples.
+    heavy += 1;
+    light += 1;
+    out += 2;
+  }
+
+  // The last column, if any, is produced after the loop.
+  src_len -= (last_column ? 1u : 0u);
+  for (; src_len > 0u; src_len--, heavy += 1, light += 1, out += 2) {
+    // Interior columns have a neighbor on both sides.
+    uint32_t prev = (3u * (uint32_t)(heavy[-1])) + (uint32_t)(light[-1]);
+    uint32_t next = (3u * (uint32_t)(heavy[+1])) + (uint32_t)(light[+1]);
+    uint32_t curr = (3u * (uint32_t)(heavy[0])) + (uint32_t)(light[0]);
+    out[0] = (uint8_t)(((3u * curr) + prev + 8u) >> 4u);
+    out[1] = (uint8_t)(((3u * curr) + next + 7u) >> 4u);
+  }
+
+  if (last_column) {
+    // No right neighbor: the column's own blend stands in for it.
+    uint32_t prev = (3u * (uint32_t)(heavy[-1])) + (uint32_t)(light[-1]);
+    uint32_t curr = (3u * (uint32_t)(heavy[0])) + (uint32_t)(light[0]);
+    out[0] = (uint8_t)(((3u * curr) + prev + 8u) >> 4u);
+    out[1] = (uint8_t)(((4u * curr) + 7u) >> 4u);
+  }
+
+  return dst_ptr;
+}
+
 // wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs is indexed by inv_h
 // and then inv_v.
 static const wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func
     wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs[4][4] = {
         {
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
-            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1v2_triangle,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
         },
         {
-            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
-            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v1_triangle,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
         },
@@ -184,7 +331,7 @@
 // example, (width > 0) is a precondition, but there are many more.
 
 static void  //
-wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
+wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_edge_row(
     wuffs_base__pixel_buffer* dst,
     uint32_t width,
     uint32_t y,
@@ -296,16 +443,106 @@
       wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
           [(inv_h2 - 1u) & 3u][(inv_v2 - 1u) & 3u];
 
+  // First row.
+  uint32_t h1v2_bias = 1u;
+  wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_edge_row(
+      dst, width, 0u,                //
+      src_ptr0, src_ptr1, src_ptr2,  //
+      stride0, stride1, stride2,     //
+      inv_h0, inv_h1, inv_h2,        //
+      inv_v0, inv_v1, inv_v2,        //
+      half_width_for_2to1,           //
+      h1v2_bias,                     //
+      scratch_buffer_2k_ptr,         //
+      upfunc0, upfunc1, upfunc2);
+  h1v2_bias = 2u;
+
+  // Middle rows.
+  bool last_row = height == 2u * half_height_for_2to1;
+  uint32_t y_max_excl = last_row ? (height - 1u) : height;
   uint32_t y;
-  for (y = 0u; y < height; y++) {
-    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
-        dst, width, y,                 //
+  for (y = 1u; y < y_max_excl; y++) {
+    const uint8_t* src0_major = src_ptr0 + ((y / inv_v0) * (size_t)stride0);
+    const uint8_t* src0_minor =
+        (inv_v0 != 2u)
+            ? src0_major
+            : ((y & 1u) ? (src0_major + stride0) : (src0_major - stride0));
+    const uint8_t* src1_major = src_ptr1 + ((y / inv_v1) * (size_t)stride1);
+    const uint8_t* src1_minor =
+        (inv_v1 != 2u)
+            ? src1_major
+            : ((y & 1u) ? (src1_major + stride1) : (src1_major - stride1));
+    const uint8_t* src2_major = src_ptr2 + ((y / inv_v2) * (size_t)stride2);
+    const uint8_t* src2_minor =
+        (inv_v2 != 2u)
+            ? src2_major
+            : ((y & 1u) ? (src2_major + stride2) : (src2_major - stride2));
+    uint32_t total_src_len0 = 0u;
+    uint32_t total_src_len1 = 0u;
+    uint32_t total_src_len2 = 0u;
+
+    uint32_t x = 0u;
+    while (x < width) {
+      bool first_column = x == 0u;
+      uint32_t end = x + 672u;
+      if (end > width) {
+        end = width;
+      }
+
+      uint32_t src_len0 = ((end - x) + inv_h0 - 1u) / inv_h0;
+      uint32_t src_len1 = ((end - x) + inv_h1 - 1u) / inv_h1;
+      uint32_t src_len2 = ((end - x) + inv_h2 - 1u) / inv_h2;
+      total_src_len0 += src_len0;
+      total_src_len1 += src_len1;
+      total_src_len2 += src_len2;
+
+      const uint8_t* up0 = (*upfunc0)(          //
+          scratch_buffer_2k_ptr + (0u * 672u),  //
+          src0_major + (x / inv_h0),            //
+          src0_minor + (x / inv_h0),            //
+          src_len0,                             //
+          h1v2_bias,                            //
+          first_column,                         //
+          (total_src_len0 >= half_width_for_2to1));
+
+      const uint8_t* up1 = (*upfunc1)(          //
+          scratch_buffer_2k_ptr + (1u * 672u),  //
+          src1_major + (x / inv_h1),            //
+          src1_minor + (x / inv_h1),            //
+          src_len1,                             //
+          h1v2_bias,                            //
+          first_column,                         //
+          (total_src_len1 >= half_width_for_2to1));
+
+      const uint8_t* up2 = (*upfunc2)(          //
+          scratch_buffer_2k_ptr + (2u * 672u),  //
+          src2_major + (x / inv_h2),            //
+          src2_minor + (x / inv_h2),            //
+          src_len2,                             //
+          h1v2_bias,                            //
+          first_column,                         //
+          (total_src_len2 >= half_width_for_2to1));
+
+      for (; x < end; x++) {
+        wuffs_base__pixel_buffer__set_color_u32_at(
+            dst, x, y,
+            wuffs_base__color_ycc__as__color_u32(*up0++, *up1++, *up2++));
+      }
+    }
+
+    h1v2_bias ^= 3u;
+  }
+
+  // Last row.
+  if (y_max_excl != height) {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_edge_row(
+        dst, width, height - 1u,       //
         src_ptr0, src_ptr1, src_ptr2,  //
         stride0, stride1, stride2,     //
         inv_h0, inv_h1, inv_h2,        //
         inv_v0, inv_v1, inv_v2,        //
         half_width_for_2to1,           //
-        0u,                            //
+        h1v2_bias,                     //
         scratch_buffer_2k_ptr,         //
         upfunc0, upfunc1, upfunc2);
   }
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index af37bd6..931dbf5 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -22700,7 +22700,14 @@
 // Box filters are equivalent to nearest neighbor upsampling. These ignore the
 // src_ptr_minor, h1v2_bias, first_column and last_column arguments.
 //
-// TODO: triangle filters.
+// Triangle filters blend neighboring samples in a 3:1 ratio (in 1 dimension),
+// or 9:3:3:1 (in 2 dimensions). Compared to box filters, the result is higher
+// quality (less blocky) but takes more computational effort.
+//
+// In theory, we could use triangle filters for any (inv_h, inv_v) combination.
+// In practice, matching libjpeg-turbo, we only implement them for the common
+// chroma subsampling ratios (YCC420, YCC422 or YCC440), corresponding to an
+// (inv_h, inv_v) pair of (2, 2), (2, 1) or (1, 2).
 typedef const uint8_t* (
     *wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func)(
     uint8_t* dst_ptr,
@@ -22783,19 +22790,159 @@
   return dst_ptr;
 }
 
+// Vertical-only triangle filter: each output sample is a 3:1 blend of the
+// major and minor row samples, plus h1v2_bias, shifted right by 2. The
+// first_column and last_column arguments are not used. Returns dst_ptr.
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1v2_triangle(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor,
+    size_t src_len,
+    uint32_t h1v2_bias,
+    bool first_column,
+    bool last_column) {
+  size_t i;
+  for (i = 0u; i < src_len; i++) {
+    uint32_t major_x3 = 3u * (uint32_t)(src_ptr_major[i]);
+    uint32_t minor_x1 = (uint32_t)(src_ptr_minor[i]);
+    dst_ptr[i] = (uint8_t)((major_x3 + minor_x1 + h1v2_bias) >> 2u);
+  }
+  return dst_ptr;
+}
+
+// Horizontal-only triangle filter: each source sample yields two outputs,
+// 3:1 blends with its left and right neighbor (rounding biases 1u and 2u).
+// At a first (or last) column, the outermost output copies the sample as is.
+// src_len is decremented before being tested: presumably it is non-zero.
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v1_triangle(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column,
+    bool last_column) {
+  uint8_t* out = dst_ptr;
+  const uint8_t* src = src_ptr_major;
+  if (first_column) {
+    uint32_t curr = src[0];
+    src_len--;
+    if ((src_len == 0u) && last_column) {
+      out[0] = (uint8_t)curr;
+      out[1] = (uint8_t)curr;
+      return dst_ptr;
+    }
+    uint32_t next = src[1];
+    out[0] = (uint8_t)curr;
+    out[1] = (uint8_t)(((3u * curr) + next + 2u) >> 2u);
+    if (src_len == 0u) {
+      return dst_ptr;
+    }
+    src += 1;
+    out += 2;
+  }
+
+  src_len -= (last_column ? 1u : 0u);
+  for (; src_len > 0u; src_len--, src += 1, out += 2) {
+    uint32_t prev = src[-1];
+    uint32_t next = src[+1];
+    uint32_t curr3 = 3u * (uint32_t)(src[0]);
+    out[0] = (uint8_t)((curr3 + prev + 1u) >> 2u);
+    out[1] = (uint8_t)((curr3 + next + 2u) >> 2u);
+  }
+
+  if (last_column) {
+    uint32_t prev = src[-1];
+    uint32_t curr = src[0];
+    out[0] = (uint8_t)(((3u * curr) + prev + 1u) >> 2u);
+    out[1] = (uint8_t)curr;
+  }
+  return dst_ptr;
+}
+
+// Two-dimensional triangle filter: each source column yields two outputs.
+// The major and minor rows are first blended 3:1 within each column. That
+// per-column blend is then mixed 3:1 with the blend of the column to its left
+// (first output, rounding bias 8u) or right (second output, rounding bias
+// 7u), giving 9:3:3:1 weights overall, and the sum is divided by 16.
+//
+// At a first (or last) column, the missing left (or right) neighbor is
+// replaced by the column itself. The bias argument is not used. src_len is
+// decremented before being tested: presumably callers pass a non-zero value.
+static const uint8_t*  //
+wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle(
+    uint8_t* dst_ptr,
+    const uint8_t* src_ptr_major,
+    const uint8_t* src_ptr_minor,
+    size_t src_len,
+    uint32_t h1v2_bias_ignored,
+    bool first_column,
+    bool last_column) {
+  // heavy is the row weighted by 3u; light is the row weighted by 1u.
+  const uint8_t* heavy = src_ptr_major;
+  const uint8_t* light = src_ptr_minor;
+  uint8_t* out = dst_ptr;
+
+  if (first_column) {
+    uint32_t curr = (3u * (uint32_t)(heavy[0])) + (uint32_t)(light[0]);
+    src_len--;
+    if ((src_len == 0u) && last_column) {
+      // A lone column has no neighbor on either side.
+      out[0] = (uint8_t)(((4u * curr) + 8u) >> 4u);
+      out[1] = (uint8_t)(((4u * curr) + 7u) >> 4u);
+      return dst_ptr;
+    }
+
+    // No left neighbor: the column's own blend stands in for it.
+    uint32_t next = (3u * (uint32_t)(heavy[1])) + (uint32_t)(light[1]);
+    out[0] = (uint8_t)(((4u * curr) + 8u) >> 4u);
+    out[1] = (uint8_t)(((3u * curr) + next + 7u) >> 4u);
+    if (src_len == 0u) {
+      return dst_ptr;
+    }
+    // The pointers advance by one source column, i.e. two output samples.
+    heavy += 1;
+    light += 1;
+    out += 2;
+  }
+
+  // The last column, if any, is produced after the loop.
+  src_len -= (last_column ? 1u : 0u);
+  for (; src_len > 0u; src_len--, heavy += 1, light += 1, out += 2) {
+    // Interior columns have a neighbor on both sides.
+    uint32_t prev = (3u * (uint32_t)(heavy[-1])) + (uint32_t)(light[-1]);
+    uint32_t next = (3u * (uint32_t)(heavy[+1])) + (uint32_t)(light[+1]);
+    uint32_t curr = (3u * (uint32_t)(heavy[0])) + (uint32_t)(light[0]);
+    out[0] = (uint8_t)(((3u * curr) + prev + 8u) >> 4u);
+    out[1] = (uint8_t)(((3u * curr) + next + 7u) >> 4u);
+  }
+
+  if (last_column) {
+    // No right neighbor: the column's own blend stands in for it.
+    uint32_t prev = (3u * (uint32_t)(heavy[-1])) + (uint32_t)(light[-1]);
+    uint32_t curr = (3u * (uint32_t)(heavy[0])) + (uint32_t)(light[0]);
+    out[0] = (uint8_t)(((3u * curr) + prev + 8u) >> 4u);
+    out[1] = (uint8_t)(((4u * curr) + 7u) >> 4u);
+  }
+
+  return dst_ptr;
+}
+
 // wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs is indexed by inv_h
 // and then inv_v.
 static const wuffs_base__pixel_swizzler__swizzle_ycc__upsample_func
     wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs[4][4] = {
         {
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
-            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1v2_triangle,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h1vn_box,
         },
         {
-            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
-            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v1_triangle,
+            wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v2_triangle,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
             wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box,
         },
@@ -22832,7 +22979,7 @@
 // example, (width > 0) is a precondition, but there are many more.
 
 static void  //
-wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
+wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_edge_row(
     wuffs_base__pixel_buffer* dst,
     uint32_t width,
     uint32_t y,
@@ -22944,16 +23091,106 @@
       wuffs_base__pixel_swizzler__swizzle_ycc__upsample_funcs
           [(inv_h2 - 1u) & 3u][(inv_v2 - 1u) & 3u];
 
+  // First row.
+  uint32_t h1v2_bias = 1u;
+  wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_edge_row(
+      dst, width, 0u,                //
+      src_ptr0, src_ptr1, src_ptr2,  //
+      stride0, stride1, stride2,     //
+      inv_h0, inv_h1, inv_h2,        //
+      inv_v0, inv_v1, inv_v2,        //
+      half_width_for_2to1,           //
+      h1v2_bias,                     //
+      scratch_buffer_2k_ptr,         //
+      upfunc0, upfunc1, upfunc2);
+  h1v2_bias = 2u;
+
+  // Middle rows.
+  bool last_row = height == 2u * half_height_for_2to1;
+  uint32_t y_max_excl = last_row ? (height - 1u) : height;
   uint32_t y;
-  for (y = 0u; y < height; y++) {
-    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_single_row(
-        dst, width, y,                 //
+  for (y = 1u; y < y_max_excl; y++) {
+    const uint8_t* src0_major = src_ptr0 + ((y / inv_v0) * (size_t)stride0);
+    const uint8_t* src0_minor =
+        (inv_v0 != 2u)
+            ? src0_major
+            : ((y & 1u) ? (src0_major + stride0) : (src0_major - stride0));
+    const uint8_t* src1_major = src_ptr1 + ((y / inv_v1) * (size_t)stride1);
+    const uint8_t* src1_minor =
+        (inv_v1 != 2u)
+            ? src1_major
+            : ((y & 1u) ? (src1_major + stride1) : (src1_major - stride1));
+    const uint8_t* src2_major = src_ptr2 + ((y / inv_v2) * (size_t)stride2);
+    const uint8_t* src2_minor =
+        (inv_v2 != 2u)
+            ? src2_major
+            : ((y & 1u) ? (src2_major + stride2) : (src2_major - stride2));
+    uint32_t total_src_len0 = 0u;
+    uint32_t total_src_len1 = 0u;
+    uint32_t total_src_len2 = 0u;
+
+    uint32_t x = 0u;
+    while (x < width) {
+      bool first_column = x == 0u;
+      uint32_t end = x + 672u;
+      if (end > width) {
+        end = width;
+      }
+
+      uint32_t src_len0 = ((end - x) + inv_h0 - 1u) / inv_h0;
+      uint32_t src_len1 = ((end - x) + inv_h1 - 1u) / inv_h1;
+      uint32_t src_len2 = ((end - x) + inv_h2 - 1u) / inv_h2;
+      total_src_len0 += src_len0;
+      total_src_len1 += src_len1;
+      total_src_len2 += src_len2;
+
+      const uint8_t* up0 = (*upfunc0)(          //
+          scratch_buffer_2k_ptr + (0u * 672u),  //
+          src0_major + (x / inv_h0),            //
+          src0_minor + (x / inv_h0),            //
+          src_len0,                             //
+          h1v2_bias,                            //
+          first_column,                         //
+          (total_src_len0 >= half_width_for_2to1));
+
+      const uint8_t* up1 = (*upfunc1)(          //
+          scratch_buffer_2k_ptr + (1u * 672u),  //
+          src1_major + (x / inv_h1),            //
+          src1_minor + (x / inv_h1),            //
+          src_len1,                             //
+          h1v2_bias,                            //
+          first_column,                         //
+          (total_src_len1 >= half_width_for_2to1));
+
+      const uint8_t* up2 = (*upfunc2)(          //
+          scratch_buffer_2k_ptr + (2u * 672u),  //
+          src2_major + (x / inv_h2),            //
+          src2_minor + (x / inv_h2),            //
+          src_len2,                             //
+          h1v2_bias,                            //
+          first_column,                         //
+          (total_src_len2 >= half_width_for_2to1));
+
+      for (; x < end; x++) {
+        wuffs_base__pixel_buffer__set_color_u32_at(
+            dst, x, y,
+            wuffs_base__color_ycc__as__color_u32(*up0++, *up1++, *up2++));
+      }
+    }
+
+    h1v2_bias ^= 3u;
+  }
+
+  // Last row.
+  if (y_max_excl != height) {
+    wuffs_base__pixel_swizzler__swizzle_ycc__general__triangle_filter_edge_row(
+        dst, width, height - 1u,       //
         src_ptr0, src_ptr1, src_ptr2,  //
         stride0, stride1, stride2,     //
         inv_h0, inv_h1, inv_h2,        //
         inv_v0, inv_v1, inv_v2,        //
         half_width_for_2to1,           //
-        0u,                            //
+        h1v2_bias,                     //
         scratch_buffer_2k_ptr,         //
         upfunc0, upfunc1, upfunc2);
   }
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index 7090e03..e9d75f9 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -523,14 +523,20 @@
   };
 
   const uint8_t want_array[32] = {
-      0x41, 0x41, 0x20, 0x20, 0x6C, 0x6C, 0x6F, 0x6F,  //
-      0x76, 0x76, 0x65, 0x65, 0x6C, 0x6C, 0x79, 0x79,  //
-      0x20, 0x20, 0x65, 0x65, 0x78, 0x78, 0x61, 0x61,  //
-      0x6D, 0x6D, 0x70, 0x70, 0x6C, 0x6C, 0x65, 0x65,  //
+      // A box filter (nearest neighbor) would look like this:
+      //   0x41, 0x41, 0x20, 0x20, 0x6C, 0x6C, 0x6F, 0x6F,
+      //   0x76, 0x76, 0x65, 0x65, 0x6C, 0x6C, 0x79, 0x79,
+      //   0x20, 0x20, 0x65, 0x65, 0x78, 0x78, 0x61, 0x61,
+      //   0x6D, 0x6D, 0x70, 0x70, 0x6C, 0x6C, 0x65, 0x65,
+      // We use a triangle filter instead, matching libjpeg-turbo.
+      0x41, 0x39, 0x28, 0x33, 0x59, 0x6D, 0x6E, 0x71,  //
+      0x74, 0x72, 0x69, 0x67, 0x6A, 0x6F, 0x76, 0x63,  //
+      0x36, 0x31, 0x54, 0x6A, 0x73, 0x72, 0x67, 0x64,  //
+      0x6A, 0x6E, 0x6F, 0x6F, 0x6D, 0x6A, 0x67, 0x65,  //
   };
 
   const uint8_t* have_ptr =
-      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2vn_box(
+      wuffs_base__pixel_swizzler__swizzle_ycc__upsample_inv_h2v1_triangle(
           g_have_array_u8, src_array0, src_array0, 16, 0, true, true);
 
   const bool closed = true;
diff --git a/test/nia-checksums-of-data.txt b/test/nia-checksums-of-data.txt
index cd4fe1c..200d761 100644
--- a/test/nia-checksums-of-data.txt
+++ b/test/nia-checksums-of-data.txt
@@ -25,7 +25,7 @@
 e08a7cc8 test/data/artificial-png/exif.png
 e08a7cc8 test/data/artificial-png/key-value-pairs.png
 076cb375 test/data/bricks-color.bmp
-96a13918 test/data/bricks-color.jpeg
+72a1f9cc test/data/bricks-color.jpeg
 076cb375 test/data/bricks-color.png
 076cb375 test/data/bricks-color.tga
 f36c2e80 test/data/bricks-dither.bmp
@@ -49,11 +49,11 @@
 3014b4c0 test/data/gifplayer-muybridge.gif
 030f5a48 test/data/harvesters.bmp
 c18b3d5a test/data/harvesters.gif
-3f0a404d test/data/harvesters.jpeg
+f217df74 test/data/harvesters.jpeg
 030f5a48 test/data/harvesters.png
 e776c90f test/data/hat.bmp
 6dcba6a4 test/data/hat.gif
-75f4f686 test/data/hat.jpeg
+2298f3ca test/data/hat.jpeg
 e776c90f test/data/hat.png
 d30bfe5d test/data/hat.wbmp
 33a44f22 test/data/hibiscus.primitive.bmp
@@ -62,7 +62,7 @@
 33a44f22 test/data/hibiscus.primitive.png
 60040742 test/data/hibiscus.regular.bmp
 b727da8b test/data/hibiscus.regular.gif
-886ee1a1 test/data/hibiscus.regular.jpeg
+41e39405 test/data/hibiscus.regular.jpeg
 60040742 test/data/hibiscus.regular.png
 dcbb225a test/data/hippopotamus.bmp
 ed4b78fc test/data/hippopotamus.interlaced.gif