std/jpeg: implement Inverse DCT

Updates #42
diff --git a/internal/cgen/base/fundamental-private.h b/internal/cgen/base/fundamental-private.h
index 71d33f2..316720f 100644
--- a/internal/cgen/base/fundamental-private.h
+++ b/internal/cgen/base/fundamental-private.h
@@ -170,6 +170,14 @@
   *x = wuffs_base__u64__sat_sub(*x, y);
 }
 
+// ---------------- Numeric Types (Utility)
+
+#define wuffs_base__utility__sign_extend_convert_u16_u32(a) \
+  ((uint32_t)(int32_t)(int16_t)(a))  // narrow to i16, then sign-extend to 32
+
+#define wuffs_base__utility__sign_extend_rshift_u32(a, n) \
+  ((uint32_t)(((int32_t)(a)) >> (n)))  // arithmetic >> assumed (impl.-defined)
+
 // ---------------- Slices and Tables
 
 // wuffs_base__slice_u8__prefix returns up to the first up_to bytes of s.
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 77a6ccd..e3a172a 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -386,6 +386,8 @@
 		"min_incl_x: u32, min_incl_y: u32, max_incl_x: u32, max_incl_y: u32) rect_ii_u32",
 	"utility.make_rect_ie_u32(" +
 		"min_incl_x: u32, min_incl_y: u32, max_excl_x: u32, max_excl_y: u32) rect_ie_u32",
+	"utility.sign_extend_convert_u16_u32(a: u16) u32",
+	"utility.sign_extend_rshift_u32(a: u32, n: u32[..= 31]) u32",
 
 	// ---- ranges
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 03c472e..bf4c165 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -11841,6 +11841,14 @@
   *x = wuffs_base__u64__sat_sub(*x, y);
 }
 
+// ---------------- Numeric Types (Utility)
+
+#define wuffs_base__utility__sign_extend_convert_u16_u32(a) \
+  ((uint32_t)(int32_t)(int16_t)(a))  // narrow to i16, then sign-extend to 32
+
+#define wuffs_base__utility__sign_extend_rshift_u32(a, n) \
+  ((uint32_t)(((int32_t)(a)) >> (n)))  // arithmetic >> assumed (impl.-defined)
+
 // ---------------- Slices and Tables
 
 // wuffs_base__slice_u8__prefix returns up to the first up_to bytes of s.
@@ -35399,10 +35407,150 @@
   63, 63, 63, 63, 63, 63, 63, 63,
 };
 
+static const uint8_t  // entry i = clamp(s + 128, 0, 255), s = i as signed 10-bit
+WUFFS_JPEG__BIAS_AND_CLAMP[1024] WUFFS_BASE__POTENTIALLY_UNUSED = {
+  128, 129, 130, 131, 132, 133, 134, 135,  // [0, 127]: i + 128
+  136, 137, 138, 139, 140, 141, 142, 143,
+  144, 145, 146, 147, 148, 149, 150, 151,
+  152, 153, 154, 155, 156, 157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167,
+  168, 169, 170, 171, 172, 173, 174, 175,
+  176, 177, 178, 179, 180, 181, 182, 183,
+  184, 185, 186, 187, 188, 189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199,
+  200, 201, 202, 203, 204, 205, 206, 207,
+  208, 209, 210, 211, 212, 213, 214, 215,
+  216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231,
+  232, 233, 234, 235, 236, 237, 238, 239,
+  240, 241, 242, 243, 244, 245, 246, 247,
+  248, 249, 250, 251, 252, 253, 254, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,  // [128, 511]: saturated high at 255
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255,
+  0, 0, 0, 0, 0, 0, 0, 0,  // [512, 895]: saturated low at 0
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0,
+  0, 1, 2, 3, 4, 5, 6, 7,  // [896, 1023]: i - 896
+  8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23,
+  24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39,
+  40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55,
+  56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71,
+  72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87,
+  88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103,
+  104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119,
+  120, 121, 122, 123, 124, 125, 126, 127,
+};
+
 // ---------------- Private Initializer Prototypes
 
 // ---------------- Private Function Prototypes
 
+static wuffs_base__empty_struct
+wuffs_jpeg__decoder__decode_idct(
+    wuffs_jpeg__decoder* self,
+    wuffs_base__slice_u8 a_dst_buffer,
+    uint64_t a_dst_stride,  // the definition returns early when this is < 8
+    uint32_t a_b,  // first index into self->private_data.f_mcu_blocks
+    uint32_t a_q);  // first index into self->private_impl.f_quant_tables
+
 static wuffs_base__status
 wuffs_jpeg__decoder__do_decode_image_config(
     wuffs_jpeg__decoder* self,
@@ -35588,6 +35736,814 @@
 
 // ---------------- Function Implementations
 
+// -------- func jpeg.decoder.decode_idct
+
+static wuffs_base__empty_struct
+wuffs_jpeg__decoder__decode_idct(
+    wuffs_jpeg__decoder* self,
+    wuffs_base__slice_u8 a_dst_buffer,
+    uint64_t a_dst_stride,
+    uint32_t a_b,
+    uint32_t a_q) {
+  uint32_t v_bq0 = 0;
+  uint32_t v_bq2 = 0;
+  uint32_t v_bq4 = 0;
+  uint32_t v_bq6 = 0;
+  uint32_t v_ca = 0;
+  uint32_t v_cb2 = 0;
+  uint32_t v_cb6 = 0;
+  uint32_t v_ccp = 0;
+  uint32_t v_ccm = 0;
+  uint32_t v_cd0 = 0;
+  uint32_t v_cd1 = 0;
+  uint32_t v_cd2 = 0;
+  uint32_t v_cd3 = 0;
+  uint32_t v_bq1 = 0;
+  uint32_t v_bq3 = 0;
+  uint32_t v_bq5 = 0;
+  uint32_t v_bq7 = 0;
+  uint32_t v_ci51 = 0;
+  uint32_t v_ci53 = 0;
+  uint32_t v_ci71 = 0;
+  uint32_t v_ci73 = 0;
+  uint32_t v_cj = 0;
+  uint32_t v_ck1 = 0;
+  uint32_t v_ck3 = 0;
+  uint32_t v_ck5 = 0;
+  uint32_t v_ck7 = 0;
+  uint32_t v_cl51 = 0;
+  uint32_t v_cl73 = 0;
+  uint32_t v_in0 = 0;
+  uint32_t v_in2 = 0;
+  uint32_t v_in4 = 0;
+  uint32_t v_in6 = 0;
+  uint32_t v_ra = 0;
+  uint32_t v_rb2 = 0;
+  uint32_t v_rb6 = 0;
+  uint32_t v_rcp = 0;
+  uint32_t v_rcm = 0;
+  uint32_t v_rd0 = 0;
+  uint32_t v_rd1 = 0;
+  uint32_t v_rd2 = 0;
+  uint32_t v_rd3 = 0;
+  uint32_t v_in1 = 0;
+  uint32_t v_in3 = 0;
+  uint32_t v_in5 = 0;
+  uint32_t v_in7 = 0;
+  uint32_t v_ri51 = 0;
+  uint32_t v_ri53 = 0;
+  uint32_t v_ri71 = 0;
+  uint32_t v_ri73 = 0;
+  uint32_t v_rj = 0;
+  uint32_t v_rk1 = 0;
+  uint32_t v_rk3 = 0;
+  uint32_t v_rk5 = 0;
+  uint32_t v_rk7 = 0;
+  uint32_t v_rl51 = 0;
+  uint32_t v_rl73 = 0;
+  uint32_t v_intermediate[64] = {0};
+
+  if (8 > a_dst_stride) {
+    return wuffs_base__make_empty_struct();
+  }
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][16]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][16]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][48]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][48]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][0]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][0]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][32]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][32]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][8]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][8]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][24]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][24]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][40]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][40]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][56]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][56]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[0] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[56] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[8] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[48] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[16] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[40] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[24] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[32] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][17]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][17]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][49]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][49]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][1]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][1]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][33]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][33]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][9]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][9]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][25]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][25]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][41]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][41]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][57]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][57]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[1] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[57] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[9] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[49] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[17] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[41] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[25] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[33] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][18]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][18]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][50]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][50]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][2]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][2]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][34]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][34]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][10]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][10]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][26]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][26]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][42]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][42]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][58]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][58]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[2] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[58] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[10] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[50] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[18] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[42] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[26] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[34] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][19]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][19]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][51]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][51]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][3]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][3]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][35]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][35]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][11]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][11]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][27]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][27]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][43]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][43]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][59]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][59]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[3] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[59] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[11] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[51] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[19] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[43] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[27] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[35] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][20]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][20]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][52]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][52]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][4]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][4]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][36]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][36]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][12]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][12]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][28]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][28]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][44]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][44]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][60]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][60]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[4] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[60] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[12] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[52] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[20] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[44] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[28] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[36] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][21]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][21]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][53]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][53]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][5]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][5]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][37]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][37]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][13]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][13]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][29]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][29]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][45]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][45]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][61]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][61]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[5] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[61] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[13] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[53] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[21] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[45] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[29] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[37] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][22]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][22]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][54]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][54]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][6]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][6]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][38]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][38]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][14]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][14]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][30]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][30]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][46]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][46]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][62]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][62]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[6] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[62] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[14] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[54] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[22] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[46] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[30] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[38] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_bq2 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][23]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][23]))));
+  v_bq6 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][55]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][55]))));
+  v_ca = ((uint32_t)(((uint32_t)(v_bq2 + v_bq6)) * 4433));
+  v_cb2 = ((uint32_t)(v_ca + ((uint32_t)(v_bq2 * 6270))));
+  v_cb6 = ((uint32_t)(v_ca - ((uint32_t)(v_bq6 * 15137))));
+  v_bq0 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][7]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][7]))));
+  v_bq4 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][39]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][39]))));
+  v_ccp = ((uint32_t)(((uint32_t)(v_bq0 + v_bq4)) << 13));
+  v_ccm = ((uint32_t)(((uint32_t)(v_bq0 - v_bq4)) << 13));
+  v_cd0 = ((uint32_t)(v_ccp + v_cb2));
+  v_cd1 = ((uint32_t)(v_ccm + v_cb6));
+  v_cd2 = ((uint32_t)(v_ccm - v_cb6));
+  v_cd3 = ((uint32_t)(v_ccp - v_cb2));
+  v_bq1 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][15]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][15]))));
+  v_bq3 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][31]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][31]))));
+  v_bq5 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][47]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][47]))));
+  v_bq7 = ((uint32_t)(wuffs_base__utility__sign_extend_convert_u16_u32(self->private_data.f_mcu_blocks[a_b][63]) * ((uint32_t)(self->private_impl.f_quant_tables[a_q][63]))));
+  v_ci51 = ((uint32_t)(v_bq5 + v_bq1));
+  v_ci53 = ((uint32_t)(v_bq5 + v_bq3));
+  v_ci71 = ((uint32_t)(v_bq7 + v_bq1));
+  v_ci73 = ((uint32_t)(v_bq7 + v_bq3));
+  v_cj = ((uint32_t)(((uint32_t)(v_ci73 + v_ci51)) * 9633));
+  v_ck1 = ((uint32_t)(v_bq1 * 12299));
+  v_ck3 = ((uint32_t)(v_bq3 * 25172));
+  v_ck5 = ((uint32_t)(v_bq5 * 16819));
+  v_ck7 = ((uint32_t)(v_bq7 * 2446));
+  v_ci51 *= 4294964100;
+  v_ci53 *= 4294946301;
+  v_ci71 *= 4294959923;
+  v_ci73 *= 4294951227;
+  v_cl51 = ((uint32_t)(v_ci51 + v_cj));
+  v_cl73 = ((uint32_t)(v_ci73 + v_cj));
+  v_ck1 += ((uint32_t)(v_ci71 + v_cl51));
+  v_ck3 += ((uint32_t)(v_ci53 + v_cl73));
+  v_ck5 += ((uint32_t)(v_ci53 + v_cl51));
+  v_ck7 += ((uint32_t)(v_ci71 + v_cl73));
+  v_intermediate[7] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 + v_ck1)) + 1024)), 11);
+  v_intermediate[63] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd0 - v_ck1)) + 1024)), 11);
+  v_intermediate[15] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 + v_ck3)) + 1024)), 11);
+  v_intermediate[55] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd1 - v_ck3)) + 1024)), 11);
+  v_intermediate[23] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 + v_ck5)) + 1024)), 11);
+  v_intermediate[47] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd2 - v_ck5)) + 1024)), 11);
+  v_intermediate[31] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 + v_ck7)) + 1024)), 11);
+  v_intermediate[39] = wuffs_base__utility__sign_extend_rshift_u32(((uint32_t)(((uint32_t)(v_cd3 - v_ck7)) + 1024)), 11);
+  v_in2 = v_intermediate[2];
+  v_in6 = v_intermediate[6];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[0];
+  v_in4 = v_intermediate[4];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[1];
+  v_in3 = v_intermediate[3];
+  v_in5 = v_intermediate[5];
+  v_in7 = v_intermediate[7];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[10];
+  v_in6 = v_intermediate[14];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[8];
+  v_in4 = v_intermediate[12];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[9];
+  v_in3 = v_intermediate[11];
+  v_in5 = v_intermediate[13];
+  v_in7 = v_intermediate[15];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[18];
+  v_in6 = v_intermediate[22];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[16];
+  v_in4 = v_intermediate[20];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[17];
+  v_in3 = v_intermediate[19];
+  v_in5 = v_intermediate[21];
+  v_in7 = v_intermediate[23];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[26];
+  v_in6 = v_intermediate[30];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[24];
+  v_in4 = v_intermediate[28];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[25];
+  v_in3 = v_intermediate[27];
+  v_in5 = v_intermediate[29];
+  v_in7 = v_intermediate[31];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[34];
+  v_in6 = v_intermediate[38];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[32];
+  v_in4 = v_intermediate[36];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[33];
+  v_in3 = v_intermediate[35];
+  v_in5 = v_intermediate[37];
+  v_in7 = v_intermediate[39];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[42];
+  v_in6 = v_intermediate[46];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[40];
+  v_in4 = v_intermediate[44];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[41];
+  v_in3 = v_intermediate[43];
+  v_in5 = v_intermediate[45];
+  v_in7 = v_intermediate[47];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[50];
+  v_in6 = v_intermediate[54];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[48];
+  v_in4 = v_intermediate[52];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[49];
+  v_in3 = v_intermediate[51];
+  v_in5 = v_intermediate[53];
+  v_in7 = v_intermediate[55];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (a_dst_stride > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer = wuffs_base__slice_u8__subslice_i(a_dst_buffer, a_dst_stride);
+  v_in2 = v_intermediate[58];
+  v_in6 = v_intermediate[62];
+  v_ra = ((uint32_t)(((uint32_t)(v_in2 + v_in6)) * 4433));
+  v_rb2 = ((uint32_t)(v_ra + ((uint32_t)(v_in2 * 6270))));
+  v_rb6 = ((uint32_t)(v_ra - ((uint32_t)(v_in6 * 15137))));
+  v_in0 = v_intermediate[56];
+  v_in4 = v_intermediate[60];
+  v_rcp = ((uint32_t)(((uint32_t)(v_in0 + v_in4)) << 13));
+  v_rcm = ((uint32_t)(((uint32_t)(v_in0 - v_in4)) << 13));
+  v_rd0 = ((uint32_t)(v_rcp + v_rb2));
+  v_rd1 = ((uint32_t)(v_rcm + v_rb6));
+  v_rd2 = ((uint32_t)(v_rcm - v_rb6));
+  v_rd3 = ((uint32_t)(v_rcp - v_rb2));
+  v_in1 = v_intermediate[57];
+  v_in3 = v_intermediate[59];
+  v_in5 = v_intermediate[61];
+  v_in7 = v_intermediate[63];
+  v_ri51 = ((uint32_t)(v_in5 + v_in1));
+  v_ri53 = ((uint32_t)(v_in5 + v_in3));
+  v_ri71 = ((uint32_t)(v_in7 + v_in1));
+  v_ri73 = ((uint32_t)(v_in7 + v_in3));
+  v_rj = ((uint32_t)(((uint32_t)(v_ri73 + v_ri51)) * 9633));
+  v_rk1 = ((uint32_t)(v_in1 * 12299));
+  v_rk3 = ((uint32_t)(v_in3 * 25172));
+  v_rk5 = ((uint32_t)(v_in5 * 16819));
+  v_rk7 = ((uint32_t)(v_in7 * 2446));
+  v_ri51 *= 4294964100;
+  v_ri53 *= 4294946301;
+  v_ri71 *= 4294959923;
+  v_ri73 *= 4294951227;
+  v_rl51 = ((uint32_t)(v_ri51 + v_rj));
+  v_rl73 = ((uint32_t)(v_ri73 + v_rj));
+  v_rk1 += ((uint32_t)(v_ri71 + v_rl51));
+  v_rk3 += ((uint32_t)(v_ri53 + v_rl73));
+  v_rk5 += ((uint32_t)(v_ri53 + v_rl51));
+  v_rk7 += ((uint32_t)(v_ri71 + v_rl73));
+  if (8 > ((uint64_t)(a_dst_buffer.len))) {
+    return wuffs_base__make_empty_struct();
+  }
+  a_dst_buffer.ptr[0] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 + v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[7] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd0 - v_rk1)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[1] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 + v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[6] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd1 - v_rk3)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[2] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 + v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[5] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd2 - v_rk5)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[3] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 + v_rk7)) + 131072)) >> 18) & 1023)];
+  a_dst_buffer.ptr[4] = WUFFS_JPEG__BIAS_AND_CLAMP[((((uint32_t)(((uint32_t)(v_rd3 - v_rk7)) + 131072)) >> 18) & 1023)];
+  return wuffs_base__make_empty_struct();
+}
+
 // -------- func jpeg.decoder.set_quirk
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status
diff --git a/script/print-jpeg-idct-code.go b/script/print-jpeg-idct-code.go
new file mode 100644
index 0000000..76a16d1
--- /dev/null
+++ b/script/print-jpeg-idct-code.go
@@ -0,0 +1,323 @@
+// Copyright 2023 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build ignore
+// +build ignore
+
+package main
+
+// print-jpeg-idct-code.go prints the "func jpeg.decoder.decode_idct" code.
+//
+// Usage: go run print-jpeg-idct-code.go
+
+import (
+	"fmt"
+	"os"
+	"strings"
+)
+
+func main() { // Entry point: runs main1 and reports any error it returns.
+	if err := main1(); err != nil {
+		os.Stderr.WriteString(err.Error() + "\n")
+		os.Exit(1) // A non-zero exit status signals the failure.
+	}
+}
+
+func main1() error { // Writes the generated code to standard output; always returns nil.
+	fmt.Printf("// -------- BEGIN generated by script/print-jpeg-idct-code.go\n")
+	fmt.Println()
+
+	fmt.Printf("// p0_298631336 = 0x%04X = %5d\n", round(p0_298631336), round(p0_298631336)) // The rounded constants are listed first, as comments.
+	fmt.Printf("// p0_390180644 = 0x%04X = %5d\n", round(p0_390180644), round(p0_390180644))
+	fmt.Printf("// p0_541196100 = 0x%04X = %5d\n", round(p0_541196100), round(p0_541196100))
+	fmt.Printf("// p0_765366865 = 0x%04X = %5d\n", round(p0_765366865), round(p0_765366865))
+	fmt.Printf("// p0_899976223 = 0x%04X = %5d\n", round(p0_899976223), round(p0_899976223))
+	fmt.Printf("// p1_175875602 = 0x%04X = %5d\n", round(p1_175875602), round(p1_175875602))
+	fmt.Printf("// p1_501321110 = 0x%04X = %5d\n", round(p1_501321110), round(p1_501321110))
+	fmt.Printf("// p1_847759065 = 0x%04X = %5d\n", round(p1_847759065), round(p1_847759065))
+	fmt.Printf("// p1_961570560 = 0x%04X = %5d\n", round(p1_961570560), round(p1_961570560))
+	fmt.Printf("// p2_053119869 = 0x%04X = %5d\n", round(p2_053119869), round(p2_053119869))
+	fmt.Printf("// p2_562915447 = 0x%04X = %5d\n", round(p2_562915447), round(p2_562915447))
+	fmt.Printf("// p3_072711026 = 0x%04X = %5d\n", round(p3_072711026), round(p3_072711026))
+	fmt.Printf("//\n")
+	fmt.Printf("// m0_390180644 = 0x%08X = %10d\n", round(m0_390180644), round(m0_390180644)) // The m values are formatted as 8 hex digits (32 bits).
+	fmt.Printf("// m0_899976223 = 0x%08X = %10d\n", round(m0_899976223), round(m0_899976223))
+	fmt.Printf("// m1_961570560 = 0x%08X = %10d\n", round(m1_961570560), round(m1_961570560))
+	fmt.Printf("// m2_562915447 = 0x%08X = %10d\n", round(m2_562915447), round(m2_562915447))
+	fmt.Println()
+
+	for x := 0; x < 8; x++ { // One copy of the pass0 template per column.
+		fmt.Println(strings.TrimSpace(replace(pass0, map[string]string{
+			"$colX$":         fmt.Sprint(x),
+			"$row0colX$":     fmt.Sprintf("0x%02X", (8*0)|x), // $rowNcolX$ is the element index (8 * N) | x.
+			"$row1colX$":     fmt.Sprintf("0x%02X", (8*1)|x),
+			"$row2colX$":     fmt.Sprintf("0x%02X", (8*2)|x),
+			"$row3colX$":     fmt.Sprintf("0x%02X", (8*3)|x),
+			"$row4colX$":     fmt.Sprintf("0x%02X", (8*4)|x),
+			"$row5colX$":     fmt.Sprintf("0x%02X", (8*5)|x),
+			"$row6colX$":     fmt.Sprintf("0x%02X", (8*6)|x),
+			"$row7colX$":     fmt.Sprintf("0x%02X", (8*7)|x),
+			"$p0_298631336$": fmt.Sprintf("0x%04X", round(p0_298631336)),
+			"$p0_541196100$": fmt.Sprintf("0x%04X", round(p0_541196100)),
+			"$p0_765366865$": fmt.Sprintf("0x%04X", round(p0_765366865)),
+			"$p1_175875602$": fmt.Sprintf("0x%04X", round(p1_175875602)),
+			"$p1_501321110$": fmt.Sprintf("0x%04X", round(p1_501321110)),
+			"$p1_847759065$": fmt.Sprintf("0x%04X", round(p1_847759065)),
+			"$p2_053119869$": fmt.Sprintf("0x%04X", round(p2_053119869)),
+			"$p3_072711026$": fmt.Sprintf("0x%04X", round(p3_072711026)),
+			"$m0_390180644$": fmt.Sprintf("0x%08X", round(m0_390180644)),
+			"$m0_899976223$": fmt.Sprintf("0x%08X", round(m0_899976223)),
+			"$m1_961570560$": fmt.Sprintf("0x%08X", round(m1_961570560)),
+			"$m2_562915447$": fmt.Sprintf("0x%08X", round(m2_562915447)),
+		})))
+		fmt.Println()
+	}
+
+	for y := 0; y < 8; y++ { // One copy of the pass1 template per row.
+		fmt.Println(strings.TrimSpace(replace(pass1, map[string]string{
+			"$rowY$":         fmt.Sprint(y),
+			"$rowYcol0$":     fmt.Sprintf("0x%02X", (8*y)|0), // $rowYcolN$ is the element index (8 * y) | N.
+			"$rowYcol1$":     fmt.Sprintf("0x%02X", (8*y)|1),
+			"$rowYcol2$":     fmt.Sprintf("0x%02X", (8*y)|2),
+			"$rowYcol3$":     fmt.Sprintf("0x%02X", (8*y)|3),
+			"$rowYcol4$":     fmt.Sprintf("0x%02X", (8*y)|4),
+			"$rowYcol5$":     fmt.Sprintf("0x%02X", (8*y)|5),
+			"$rowYcol6$":     fmt.Sprintf("0x%02X", (8*y)|6),
+			"$rowYcol7$":     fmt.Sprintf("0x%02X", (8*y)|7),
+			"$p0_298631336$": fmt.Sprintf("0x%04X", round(p0_298631336)),
+			"$p0_541196100$": fmt.Sprintf("0x%04X", round(p0_541196100)),
+			"$p0_765366865$": fmt.Sprintf("0x%04X", round(p0_765366865)),
+			"$p1_175875602$": fmt.Sprintf("0x%04X", round(p1_175875602)),
+			"$p1_501321110$": fmt.Sprintf("0x%04X", round(p1_501321110)),
+			"$p1_847759065$": fmt.Sprintf("0x%04X", round(p1_847759065)),
+			"$p2_053119869$": fmt.Sprintf("0x%04X", round(p2_053119869)),
+			"$p3_072711026$": fmt.Sprintf("0x%04X", round(p3_072711026)),
+			"$m0_390180644$": fmt.Sprintf("0x%08X", round(m0_390180644)),
+			"$m0_899976223$": fmt.Sprintf("0x%08X", round(m0_899976223)),
+			"$m1_961570560$": fmt.Sprintf("0x%08X", round(m1_961570560)),
+			"$m2_562915447$": fmt.Sprintf("0x%08X", round(m2_562915447)),
+			"$bounds_check$": boundsCheck(y == 7), // y == 7 selects the final-row variants.
+			"$advance$":      advance(y == 7),
+		})))
+		fmt.Println()
+	}
+
+	fmt.Printf("// -------- END   generated by script/print-jpeg-idct-code.go\n")
+	return nil
+}
+
+func replace(s string, m map[string]string) string { // Returns s with every occurrence of each key of m replaced by its value.
+	for k, v := range m { // Map iteration order is unspecified, so the substitutions run in arbitrary order.
+		s = strings.ReplaceAll(s, k, v)
+	}
+	return s
+}
+
+func round(x float64) uint32 { // Converts x to a fixed-point integer with 13 fractional bits.
+	return uint32(0.5 + (x * (1 << 13))) // Adding 0.5 before the truncating conversion rounds non-negative values to nearest.
+}
+
+func boundsCheck(final bool) string { // Returns the Wuffs dst_buffer length check emitted for one row.
+	if final { // advance emits nothing for the final row, so only the 8 elements written are required.
+		return "" +
+			"if 8 > args.dst_buffer.length() {\n" +
+			"    return nothing\n" +
+			"}"
+	}
+	return "" + // Other rows require dst_stride elements, as advance then slices dst_buffer by dst_stride.
+		"if args.dst_stride > args.dst_buffer.length() {\n" +
+		"    return nothing\n" +
+		"}\n" +
+		`assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)` // Derives 8 <= length from 8 <= dst_stride and dst_stride <= length.
+}
+
+func advance(final bool) string { // Returns the Wuffs statement that re-slices dst_buffer past dst_stride elements.
+	if final {
+		return "" // The final row emits no advance statement.
+	}
+	return "args.dst_buffer = args.dst_buffer[args.dst_stride ..]"
+}
+
+const (
+	sqrt2 = 1.4142135623730950488016887242096980785696718753769480731766797379 // ≈ sqrt(2).
+
+	// cosNpi16 ≈ cos(N * (pi / 16)).
+	cos1pi16 = 0.9807852804032304491261822361342390369739337308933360950029160885
+	cos2pi16 = 0.9238795325112867561281831893967882868224166258636424861150977312
+	cos3pi16 = 0.8314696123025452370787883776179057567385608119872499634461245902
+	cos5pi16 = 0.5555702330196022247428308139485328743749371907548040459241535282
+	cos6pi16 = 0.3826834323650897717284599840303988667613445624856270414338006356
+	cos7pi16 = 0.1950903220161282678482848684770222409276916177519548077545020894
+)
+
+const (
+	p0_541196100 = sqrt2 * (+cos6pi16) // A name pN_DDDDDDDDD spells the constant's approximate value, N.DDDDDDDDD.
+	p1_175875602 = sqrt2 * (+cos3pi16)
+
+	p0_765366865 = sqrt2 * (+cos2pi16 - cos6pi16)
+	p1_847759065 = sqrt2 * (+cos2pi16 + cos6pi16)
+
+	p0_390180644 = sqrt2 * (+cos3pi16 - cos5pi16)
+	p0_899976223 = sqrt2 * (+cos3pi16 - cos7pi16)
+	p1_961570560 = sqrt2 * (+cos3pi16 + cos5pi16)
+	p2_562915447 = sqrt2 * (+cos3pi16 + cos1pi16)
+
+	p0_298631336 = sqrt2 * (-cos1pi16 + cos3pi16 + cos5pi16 - cos7pi16)
+	p1_501321110 = sqrt2 * (+cos1pi16 + cos3pi16 - cos5pi16 - cos7pi16)
+	p2_053119869 = sqrt2 * (+cos1pi16 + cos3pi16 - cos5pi16 + cos7pi16)
+	p3_072711026 = sqrt2 * (+cos1pi16 + cos3pi16 + cos5pi16 - cos7pi16)
+
+	m0_390180644 = (1 << (32 - 13)) - p0_390180644 // mX is 2^19 - pX, so round's 2^13 scaling yields -pX modulo 2^32.
+	m0_899976223 = (1 << (32 - 13)) - p0_899976223
+	m1_961570560 = (1 << (32 - 13)) - p1_961570560
+	m2_562915447 = (1 << (32 - 13)) - p2_562915447
+)
+
+const pass0 = `// ==== First pass, column $colX$.
+
+// Even rows.
+
+bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row2colX$]) ~mod* (this.quant_tables[args.q][$row2colX$] as base.u32)
+bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row6colX$]) ~mod* (this.quant_tables[args.q][$row6colX$] as base.u32)
+
+ca = (bq2 ~mod+ bq6) ~mod* $p0_541196100$
+
+cb2 = ca ~mod+ (bq2 ~mod* $p0_765366865$)
+cb6 = ca ~mod- (bq6 ~mod* $p1_847759065$)
+
+bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row0colX$]) ~mod* (this.quant_tables[args.q][$row0colX$] as base.u32)
+bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row4colX$]) ~mod* (this.quant_tables[args.q][$row4colX$] as base.u32)
+
+ccp = (bq0 ~mod+ bq4) ~mod<< 13
+ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+cd0 = ccp ~mod+ cb2
+cd1 = ccm ~mod+ cb6
+cd2 = ccm ~mod- cb6
+cd3 = ccp ~mod- cb2
+
+// Odd rows.
+
+bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row1colX$]) ~mod* (this.quant_tables[args.q][$row1colX$] as base.u32)
+bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row3colX$]) ~mod* (this.quant_tables[args.q][$row3colX$] as base.u32)
+bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row5colX$]) ~mod* (this.quant_tables[args.q][$row5colX$] as base.u32)
+bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][$row7colX$]) ~mod* (this.quant_tables[args.q][$row7colX$] as base.u32)
+
+ci51 = bq5 ~mod+ bq1
+ci53 = bq5 ~mod+ bq3
+ci71 = bq7 ~mod+ bq1
+ci73 = bq7 ~mod+ bq3
+
+cj = (ci73 ~mod+ ci51) ~mod* $p1_175875602$
+
+ck1 = bq1 ~mod* $p1_501321110$
+ck3 = bq3 ~mod* $p3_072711026$
+ck5 = bq5 ~mod* $p2_053119869$
+ck7 = bq7 ~mod* $p0_298631336$
+
+ci51 ~mod*= $m0_390180644$
+ci53 ~mod*= $m2_562915447$
+ci71 ~mod*= $m0_899976223$
+ci73 ~mod*= $m1_961570560$
+
+cl51 = ci51 ~mod+ cj
+cl73 = ci73 ~mod+ cj
+
+ck1 ~mod+= ci71 ~mod+ cl51
+ck3 ~mod+= ci53 ~mod+ cl73
+ck5 ~mod+= ci53 ~mod+ cl51
+ck7 ~mod+= ci71 ~mod+ cl73
+
+// Combine rows.
+
+intermediate[$row0colX$] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+intermediate[$row7colX$] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+intermediate[$row1colX$] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+intermediate[$row6colX$] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+intermediate[$row2colX$] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+intermediate[$row5colX$] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+intermediate[$row3colX$] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+intermediate[$row4colX$] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+` // pass0 is the Wuffs template for one first-pass column; main1 fills in its $name$ placeholders.
+
+const pass1 = `// ==== Second pass, row $rowY$.
+
+// Even columns.
+
+in2 = intermediate[$rowYcol2$]
+in6 = intermediate[$rowYcol6$]
+
+ra = (in2 ~mod+ in6) ~mod* $p0_541196100$
+
+rb2 = ra ~mod+ (in2 ~mod* $p0_765366865$)
+rb6 = ra ~mod- (in6 ~mod* $p1_847759065$)
+
+in0 = intermediate[$rowYcol0$]
+in4 = intermediate[$rowYcol4$]
+
+rcp = (in0 ~mod+ in4) ~mod<< 13
+rcm = (in0 ~mod- in4) ~mod<< 13
+
+rd0 = rcp ~mod+ rb2
+rd1 = rcm ~mod+ rb6
+rd2 = rcm ~mod- rb6
+rd3 = rcp ~mod- rb2
+
+// Odd columns.
+
+in1 = intermediate[$rowYcol1$]
+in3 = intermediate[$rowYcol3$]
+in5 = intermediate[$rowYcol5$]
+in7 = intermediate[$rowYcol7$]
+
+ri51 = in5 ~mod+ in1
+ri53 = in5 ~mod+ in3
+ri71 = in7 ~mod+ in1
+ri73 = in7 ~mod+ in3
+
+rj = (ri73 ~mod+ ri51) ~mod* $p1_175875602$
+
+rk1 = in1 ~mod* $p1_501321110$
+rk3 = in3 ~mod* $p3_072711026$
+rk5 = in5 ~mod* $p2_053119869$
+rk7 = in7 ~mod* $p0_298631336$
+
+ri51 ~mod*= $m0_390180644$
+ri53 ~mod*= $m2_562915447$
+ri71 ~mod*= $m0_899976223$
+ri73 ~mod*= $m1_961570560$
+
+rl51 = ri51 ~mod+ rj
+rl73 = ri73 ~mod+ rj
+
+rk1 ~mod+= ri71 ~mod+ rl51
+rk3 ~mod+= ri53 ~mod+ rl73
+rk5 ~mod+= ri53 ~mod+ rl51
+rk7 ~mod+= ri71 ~mod+ rl73
+
+// Combine columns.
+
+$bounds_check$
+
+args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+$advance$
+` // pass1 is the Wuffs template for one second-pass row; main1 fills in its $name$ placeholders.
diff --git a/std/jpeg/common_consts.wuffs b/std/jpeg/common_consts.wuffs
index e507bb2..fe007ac 100644
--- a/std/jpeg/common_consts.wuffs
+++ b/std/jpeg/common_consts.wuffs
@@ -30,3 +30,79 @@
         0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F,
         0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F,
 ]
+
+// BIAS_AND_CLAMP maps 10-bit signed values (0 centered, in the range -512 ..=
+// +511) to 8-bit unsigned values (128 centered, clamped to 0 ..= 255).
+pri const BIAS_AND_CLAMP : roarray[1024] base.u8 = [
+        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
+        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
+        0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
+        0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
+        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
+        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
+        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
+        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
+
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
+        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
+        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
+        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
+        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
+]
diff --git a/std/jpeg/decode_idct_default.wuffs b/std/jpeg/decode_idct_default.wuffs
new file mode 100644
index 0000000..3cc5810
--- /dev/null
+++ b/std/jpeg/decode_idct_default.wuffs
@@ -0,0 +1,1230 @@
+// Copyright 2023 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func decoder.decode_idct!(dst_buffer: slice base.u8, dst_stride: base.u64, b: base.u32[..= 9], q: base.u32[..= 3]) {
+    // This method implements the same algorithm as libjpeg-turbo's jidctint.c.
+
+    var bq0 : base.u32
+    var bq2 : base.u32
+    var bq4 : base.u32
+    var bq6 : base.u32
+
+    var ca : base.u32
+
+    var cb2 : base.u32
+    var cb6 : base.u32
+
+    var ccp : base.u32
+    var ccm : base.u32
+
+    var cd0 : base.u32
+    var cd1 : base.u32
+    var cd2 : base.u32
+    var cd3 : base.u32
+
+    var bq1 : base.u32
+    var bq3 : base.u32
+    var bq5 : base.u32
+    var bq7 : base.u32
+
+    var ci51 : base.u32
+    var ci53 : base.u32
+    var ci71 : base.u32
+    var ci73 : base.u32
+
+    var cj : base.u32
+
+    var ck1 : base.u32
+    var ck3 : base.u32
+    var ck5 : base.u32
+    var ck7 : base.u32
+
+    var cl51 : base.u32
+    var cl73 : base.u32
+
+    var in0 : base.u32
+    var in2 : base.u32
+    var in4 : base.u32
+    var in6 : base.u32
+
+    var ra : base.u32
+
+    var rb2 : base.u32
+    var rb6 : base.u32
+
+    var rcp : base.u32
+    var rcm : base.u32
+
+    var rd0 : base.u32
+    var rd1 : base.u32
+    var rd2 : base.u32
+    var rd3 : base.u32
+
+    var in1 : base.u32
+    var in3 : base.u32
+    var in5 : base.u32
+    var in7 : base.u32
+
+    var ri51 : base.u32
+    var ri53 : base.u32
+    var ri71 : base.u32
+    var ri73 : base.u32
+
+    var rj : base.u32
+
+    var rk1 : base.u32
+    var rk3 : base.u32
+    var rk5 : base.u32
+    var rk7 : base.u32
+
+    var rl51 : base.u32
+    var rl73 : base.u32
+
+    var intermediate : array[64] base.u32
+
+    if 8 > args.dst_stride {
+        return nothing
+    }
+
+    // -------- BEGIN generated by script/print-jpeg-idct-code.go
+
+    // p0_298631336 = 0x098E =  2446
+    // p0_390180644 = 0x0C7C =  3196
+    // p0_541196100 = 0x1151 =  4433
+    // p0_765366865 = 0x187E =  6270
+    // p0_899976223 = 0x1CCD =  7373
+    // p1_175875602 = 0x25A1 =  9633
+    // p1_501321110 = 0x300B = 12299
+    // p1_847759065 = 0x3B21 = 15137
+    // p1_961570560 = 0x3EC5 = 16069
+    // p2_053119869 = 0x41B3 = 16819
+    // p2_562915447 = 0x5203 = 20995
+    // p3_072711026 = 0x6254 = 25172
+    //
+    // m0_390180644 = 0xFFFFF384 = 4294964100
+    // m0_899976223 = 0xFFFFE333 = 4294959923
+    // m1_961570560 = 0xFFFFC13B = 4294951227
+    // m2_562915447 = 0xFFFFADFD = 4294946301
+
+    // ==== First pass, column 0.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x10]) ~mod* (this.quant_tables[args.q][0x10] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x30]) ~mod* (this.quant_tables[args.q][0x30] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x00]) ~mod* (this.quant_tables[args.q][0x00] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x20]) ~mod* (this.quant_tables[args.q][0x20] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x08]) ~mod* (this.quant_tables[args.q][0x08] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x18]) ~mod* (this.quant_tables[args.q][0x18] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x28]) ~mod* (this.quant_tables[args.q][0x28] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x38]) ~mod* (this.quant_tables[args.q][0x38] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x00] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x38] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x08] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x30] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x10] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x28] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x18] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x20] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 1.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x11]) ~mod* (this.quant_tables[args.q][0x11] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x31]) ~mod* (this.quant_tables[args.q][0x31] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x01]) ~mod* (this.quant_tables[args.q][0x01] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x21]) ~mod* (this.quant_tables[args.q][0x21] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x09]) ~mod* (this.quant_tables[args.q][0x09] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x19]) ~mod* (this.quant_tables[args.q][0x19] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x29]) ~mod* (this.quant_tables[args.q][0x29] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x39]) ~mod* (this.quant_tables[args.q][0x39] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x01] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x39] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x09] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x31] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x11] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x29] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x19] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x21] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 2.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x12]) ~mod* (this.quant_tables[args.q][0x12] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x32]) ~mod* (this.quant_tables[args.q][0x32] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x02]) ~mod* (this.quant_tables[args.q][0x02] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x22]) ~mod* (this.quant_tables[args.q][0x22] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x0A]) ~mod* (this.quant_tables[args.q][0x0A] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x1A]) ~mod* (this.quant_tables[args.q][0x1A] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x2A]) ~mod* (this.quant_tables[args.q][0x2A] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x3A]) ~mod* (this.quant_tables[args.q][0x3A] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x02] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x3A] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x0A] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x32] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x12] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x2A] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x1A] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x22] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 3.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x13]) ~mod* (this.quant_tables[args.q][0x13] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x33]) ~mod* (this.quant_tables[args.q][0x33] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x03]) ~mod* (this.quant_tables[args.q][0x03] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x23]) ~mod* (this.quant_tables[args.q][0x23] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x0B]) ~mod* (this.quant_tables[args.q][0x0B] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x1B]) ~mod* (this.quant_tables[args.q][0x1B] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x2B]) ~mod* (this.quant_tables[args.q][0x2B] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x3B]) ~mod* (this.quant_tables[args.q][0x3B] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x03] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x3B] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x0B] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x33] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x13] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x2B] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x1B] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x23] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 4.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x14]) ~mod* (this.quant_tables[args.q][0x14] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x34]) ~mod* (this.quant_tables[args.q][0x34] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x04]) ~mod* (this.quant_tables[args.q][0x04] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x24]) ~mod* (this.quant_tables[args.q][0x24] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x0C]) ~mod* (this.quant_tables[args.q][0x0C] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x1C]) ~mod* (this.quant_tables[args.q][0x1C] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x2C]) ~mod* (this.quant_tables[args.q][0x2C] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x3C]) ~mod* (this.quant_tables[args.q][0x3C] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x04] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x3C] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x0C] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x34] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x14] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x2C] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x1C] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x24] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 5.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x15]) ~mod* (this.quant_tables[args.q][0x15] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x35]) ~mod* (this.quant_tables[args.q][0x35] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x05]) ~mod* (this.quant_tables[args.q][0x05] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x25]) ~mod* (this.quant_tables[args.q][0x25] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x0D]) ~mod* (this.quant_tables[args.q][0x0D] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x1D]) ~mod* (this.quant_tables[args.q][0x1D] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x2D]) ~mod* (this.quant_tables[args.q][0x2D] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x3D]) ~mod* (this.quant_tables[args.q][0x3D] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x05] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x3D] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x0D] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x35] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x15] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x2D] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x1D] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x25] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 6.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x16]) ~mod* (this.quant_tables[args.q][0x16] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x36]) ~mod* (this.quant_tables[args.q][0x36] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x06]) ~mod* (this.quant_tables[args.q][0x06] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x26]) ~mod* (this.quant_tables[args.q][0x26] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x0E]) ~mod* (this.quant_tables[args.q][0x0E] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x1E]) ~mod* (this.quant_tables[args.q][0x1E] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x2E]) ~mod* (this.quant_tables[args.q][0x2E] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x3E]) ~mod* (this.quant_tables[args.q][0x3E] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x06] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x3E] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x0E] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x36] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x16] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x2E] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x1E] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x26] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== First pass, column 7.
+
+    // Even rows.
+
+    bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x17]) ~mod* (this.quant_tables[args.q][0x17] as base.u32)
+    bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x37]) ~mod* (this.quant_tables[args.q][0x37] as base.u32)
+
+    ca = (bq2 ~mod+ bq6) ~mod* 0x1151
+
+    cb2 = ca ~mod+ (bq2 ~mod* 0x187E)
+    cb6 = ca ~mod- (bq6 ~mod* 0x3B21)
+
+    bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x07]) ~mod* (this.quant_tables[args.q][0x07] as base.u32)
+    bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x27]) ~mod* (this.quant_tables[args.q][0x27] as base.u32)
+
+    ccp = (bq0 ~mod+ bq4) ~mod<< 13
+    ccm = (bq0 ~mod- bq4) ~mod<< 13
+
+    cd0 = ccp ~mod+ cb2
+    cd1 = ccm ~mod+ cb6
+    cd2 = ccm ~mod- cb6
+    cd3 = ccp ~mod- cb2
+
+    // Odd rows.
+
+    bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x0F]) ~mod* (this.quant_tables[args.q][0x0F] as base.u32)
+    bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x1F]) ~mod* (this.quant_tables[args.q][0x1F] as base.u32)
+    bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x2F]) ~mod* (this.quant_tables[args.q][0x2F] as base.u32)
+    bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[args.b][0x3F]) ~mod* (this.quant_tables[args.q][0x3F] as base.u32)
+
+    ci51 = bq5 ~mod+ bq1
+    ci53 = bq5 ~mod+ bq3
+    ci71 = bq7 ~mod+ bq1
+    ci73 = bq7 ~mod+ bq3
+
+    cj = (ci73 ~mod+ ci51) ~mod* 0x25A1
+
+    ck1 = bq1 ~mod* 0x300B
+    ck3 = bq3 ~mod* 0x6254
+    ck5 = bq5 ~mod* 0x41B3
+    ck7 = bq7 ~mod* 0x098E
+
+    ci51 ~mod*= 0xFFFF_F384
+    ci53 ~mod*= 0xFFFF_ADFD
+    ci71 ~mod*= 0xFFFF_E333
+    ci73 ~mod*= 0xFFFF_C13B
+
+    cl51 = ci51 ~mod+ cj
+    cl73 = ci73 ~mod+ cj
+
+    ck1 ~mod+= ci71 ~mod+ cl51
+    ck3 ~mod+= ci53 ~mod+ cl73
+    ck5 ~mod+= ci53 ~mod+ cl51
+    ck7 ~mod+= ci71 ~mod+ cl73
+
+    // Combine rows.
+
+    intermediate[0x07] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x3F] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
+    intermediate[0x0F] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x37] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
+    intermediate[0x17] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x2F] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
+    intermediate[0x1F] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
+    intermediate[0x27] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
+
+    // ==== Second pass, row 0.
+
+    // Even columns.
+
+    in2 = intermediate[0x02]
+    in6 = intermediate[0x06]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x00]
+    in4 = intermediate[0x04]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x01]
+    in3 = intermediate[0x03]
+    in5 = intermediate[0x05]
+    in7 = intermediate[0x07]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 1.
+
+    // Even columns.
+
+    in2 = intermediate[0x0A]
+    in6 = intermediate[0x0E]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x08]
+    in4 = intermediate[0x0C]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x09]
+    in3 = intermediate[0x0B]
+    in5 = intermediate[0x0D]
+    in7 = intermediate[0x0F]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 2.
+
+    // Even columns.
+
+    in2 = intermediate[0x12]
+    in6 = intermediate[0x16]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x10]
+    in4 = intermediate[0x14]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x11]
+    in3 = intermediate[0x13]
+    in5 = intermediate[0x15]
+    in7 = intermediate[0x17]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 3.
+
+    // Even columns.
+
+    in2 = intermediate[0x1A]
+    in6 = intermediate[0x1E]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x18]
+    in4 = intermediate[0x1C]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x19]
+    in3 = intermediate[0x1B]
+    in5 = intermediate[0x1D]
+    in7 = intermediate[0x1F]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 4.
+
+    // Even columns.
+
+    in2 = intermediate[0x22]
+    in6 = intermediate[0x26]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x20]
+    in4 = intermediate[0x24]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x21]
+    in3 = intermediate[0x23]
+    in5 = intermediate[0x25]
+    in7 = intermediate[0x27]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 5.
+
+    // Even columns.
+
+    in2 = intermediate[0x2A]
+    in6 = intermediate[0x2E]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x28]
+    in4 = intermediate[0x2C]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x29]
+    in3 = intermediate[0x2B]
+    in5 = intermediate[0x2D]
+    in7 = intermediate[0x2F]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 6.
+
+    // Even columns.
+
+    in2 = intermediate[0x32]
+    in6 = intermediate[0x36]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x30]
+    in4 = intermediate[0x34]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x31]
+    in3 = intermediate[0x33]
+    in5 = intermediate[0x35]
+    in7 = intermediate[0x37]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if args.dst_stride > args.dst_buffer.length() {
+        return nothing
+    }
+    assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    args.dst_buffer = args.dst_buffer[args.dst_stride ..]
+
+    // ==== Second pass, row 7.
+
+    // Even columns.
+
+    in2 = intermediate[0x3A]
+    in6 = intermediate[0x3E]
+
+    ra = (in2 ~mod+ in6) ~mod* 0x1151
+
+    rb2 = ra ~mod+ (in2 ~mod* 0x187E)
+    rb6 = ra ~mod- (in6 ~mod* 0x3B21)
+
+    in0 = intermediate[0x38]
+    in4 = intermediate[0x3C]
+
+    rcp = (in0 ~mod+ in4) ~mod<< 13
+    rcm = (in0 ~mod- in4) ~mod<< 13
+
+    rd0 = rcp ~mod+ rb2
+    rd1 = rcm ~mod+ rb6
+    rd2 = rcm ~mod- rb6
+    rd3 = rcp ~mod- rb2
+
+    // Odd columns.
+
+    in1 = intermediate[0x39]
+    in3 = intermediate[0x3B]
+    in5 = intermediate[0x3D]
+    in7 = intermediate[0x3F]
+
+    ri51 = in5 ~mod+ in1
+    ri53 = in5 ~mod+ in3
+    ri71 = in7 ~mod+ in1
+    ri73 = in7 ~mod+ in3
+
+    rj = (ri73 ~mod+ ri51) ~mod* 0x25A1
+
+    rk1 = in1 ~mod* 0x300B
+    rk3 = in3 ~mod* 0x6254
+    rk5 = in5 ~mod* 0x41B3
+    rk7 = in7 ~mod* 0x098E
+
+    ri51 ~mod*= 0xFFFF_F384
+    ri53 ~mod*= 0xFFFF_ADFD
+    ri71 ~mod*= 0xFFFF_E333
+    ri73 ~mod*= 0xFFFF_C13B
+
+    rl51 = ri51 ~mod+ rj
+    rl73 = ri73 ~mod+ rj
+
+    rk1 ~mod+= ri71 ~mod+ rl51
+    rk3 ~mod+= ri53 ~mod+ rl73
+    rk5 ~mod+= ri53 ~mod+ rl51
+    rk7 ~mod+= ri71 ~mod+ rl73
+
+    // Combine columns.
+
+    if 8 > args.dst_buffer.length() {
+        return nothing
+    }
+
+    args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+    args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
+
+    // -------- END   generated by script/print-jpeg-idct-code.go
+}
diff --git a/test/c/std/jpeg.c b/test/c/std/jpeg.c
index cffac60..81696bb 100644
--- a/test/c/std/jpeg.c
+++ b/test/c/std/jpeg.c
@@ -532,6 +532,72 @@
 }
 
 const char*  //
+test_wuffs_jpeg_decode_idct() {
+  CHECK_FOCUS(__func__);
+
+  // Known-answer test for decode_idct. Input: "test/data/bricks-color.jpeg"'s
+  // first MCU's first block, in natural (not zig-zag) order.
+  const uint16_t mcu_block[64] = {
+      0xFFC9, 0xFFD8, 0x0014, 0xFFF7, 0x0002, 0x0000, 0x0000, 0x0000,  //
+      0x006A, 0xFFE3, 0x001C, 0xFFF9, 0x0002, 0x0000, 0x0000, 0x0000,  //
+      0x0015, 0x0002, 0x0002, 0xFFFE, 0x0001, 0x0000, 0x0000, 0x0001,  //
+      0x000D, 0xFFEC, 0x0005, 0xFFFE, 0x0000, 0x0000, 0x0000, 0x0000,  //
+      0xFFFA, 0xFFFA, 0x0002, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000,  //
+      0x0001, 0xFFFD, 0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,  //
+      0x0000, 0x0001, 0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,  //
+      0x0001, 0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,  //
+  };
+
+  // This is "test/data/bricks-color.jpeg"'s first quantization table, in
+  // natural (not zig-zag) order.
+  const uint8_t quant_table[64] = {
+      0x03, 0x02, 0x02, 0x03, 0x04, 0x06, 0x08, 0x0A,  //
+      0x02, 0x02, 0x02, 0x03, 0x04, 0x09, 0x0A, 0x09,  //
+      0x02, 0x02, 0x03, 0x04, 0x06, 0x09, 0x0B, 0x09,  //
+      0x02, 0x03, 0x04, 0x05, 0x08, 0x0E, 0x0D, 0x0A,  //
+      0x03, 0x04, 0x06, 0x09, 0x0B, 0x11, 0x10, 0x0C,  //
+      0x04, 0x06, 0x09, 0x0A, 0x0D, 0x11, 0x12, 0x0F,  //
+      0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x13, 0x10,  //
+      0x0C, 0x0F, 0x0F, 0x10, 0x12, 0x10, 0x10, 0x10,  //
+  };
+
+  // This is the IDCT's expected result (including dequantization), again in
+  // natural (not zig-zag) order.
+  const uint8_t want_array[64] = {
+      0x81, 0x7E, 0x82, 0x7E, 0x82, 0x92, 0xC5, 0xF2,  //
+      0x81, 0x80, 0x84, 0x85, 0x85, 0x88, 0x9D, 0xB2,  //
+      0x86, 0x81, 0x7A, 0x77, 0x72, 0x75, 0x7E, 0x8A,  //
+      0x54, 0x58, 0x58, 0x5E, 0x5E, 0x6C, 0x79, 0x87,  //
+      0x4D, 0x54, 0x56, 0x5B, 0x59, 0x65, 0x6E, 0x7A,  //
+      0x4A, 0x4D, 0x4F, 0x53, 0x56, 0x5F, 0x67, 0x6E,  //
+      0x4A, 0x4D, 0x54, 0x58, 0x5B, 0x58, 0x56, 0x54,  //
+      0x4C, 0x4C, 0x52, 0x4F, 0x4D, 0x40, 0x3A, 0x35,  //
+  };
+
+  wuffs_jpeg__decoder dec;
+  CHECK_STATUS("initialize", wuffs_jpeg__decoder__initialize(
+                                 &dec, sizeof dec, WUFFS_VERSION,
+                                 WUFFS_INITIALIZE__DEFAULT_OPTIONS));
+
+  const uint32_t b = 0;  // f_mcu_blocks index, also passed to decode_idct.
+  memcpy(&dec.private_data.f_mcu_blocks[b], mcu_block, sizeof(mcu_block));
+
+  const uint32_t q = 0;  // f_quant_tables index, also passed to decode_idct.
+  memcpy(&dec.private_impl.f_quant_tables[q], quant_table, sizeof(quant_table));
+
+  uint8_t dst_array[64] = {0};  // Output; all 64 bytes are checked below.
+  wuffs_jpeg__decoder__decode_idct(  // NOTE(review): 8 = dst stride? Confirm.
+      &dec, wuffs_base__make_slice_u8(&dst_array[0], 64), 8, b, q);
+
+  wuffs_base__io_buffer have =
+      wuffs_base__ptr_u8__reader(&dst_array[0], 64, true);
+  wuffs_base__io_buffer want =  // The (void*) cast drops want_array's const.
+      wuffs_base__ptr_u8__reader((void*)(&want_array[0]), 64, true);
+
+  return check_io_buffers_equal("", &have, &want);
+}
+
+const char*  //
 test_wuffs_jpeg_decode_mcu() {
   CHECK_FOCUS(__func__);
 
@@ -705,6 +771,7 @@
 
     test_wuffs_jpeg_decode_dht_easy,
     test_wuffs_jpeg_decode_dht_hard,
+    test_wuffs_jpeg_decode_idct,
     test_wuffs_jpeg_decode_mcu,
     test_wuffs_jpeg_decode_interface,
     test_wuffs_jpeg_decode_truncated_input,