std/jpeg: prepare to decode progressive JPEGs
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 0dc4fbd..e808ab0 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -8901,6 +8901,7 @@
     } s_decode_sos[1];
     struct {
       uint32_t v_i;
+      uint64_t scratch;
     } s_prepare_scan[1];
   } private_data;
 
@@ -38316,6 +38317,10 @@
   bool v_has_v24 = false;
   bool v_has_v3 = false;
   uint32_t v_upper_bound = 0;
+  uint64_t v_wh0 = 0;
+  uint64_t v_wh1 = 0;
+  uint64_t v_wh2 = 0;
+  uint64_t v_wh3 = 0;
 
   const uint8_t* iop_a_src = NULL;
   const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
@@ -38559,11 +38564,15 @@
     self->private_impl.f_components_workbuf_heights[1] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[1]))));
     self->private_impl.f_components_workbuf_heights[2] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[2]))));
     self->private_impl.f_components_workbuf_heights[3] = wuffs_base__u32__min(v_upper_bound, (8 * self->private_impl.f_height_in_mcus * ((uint32_t)(self->private_impl.f_components_v[3]))));
+    v_wh0 = (((uint64_t)(self->private_impl.f_components_workbuf_widths[0])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[0])));
+    v_wh1 = (((uint64_t)(self->private_impl.f_components_workbuf_widths[1])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[1])));
+    v_wh2 = (((uint64_t)(self->private_impl.f_components_workbuf_widths[2])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[2])));
+    v_wh3 = (((uint64_t)(self->private_impl.f_components_workbuf_widths[3])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[3])));
     self->private_impl.f_components_workbuf_offsets[0] = 0;
-    self->private_impl.f_components_workbuf_offsets[1] = (self->private_impl.f_components_workbuf_offsets[0] + (((uint64_t)(self->private_impl.f_components_workbuf_widths[0])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[0]))));
-    self->private_impl.f_components_workbuf_offsets[2] = (self->private_impl.f_components_workbuf_offsets[1] + (((uint64_t)(self->private_impl.f_components_workbuf_widths[1])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[1]))));
-    self->private_impl.f_components_workbuf_offsets[3] = (self->private_impl.f_components_workbuf_offsets[2] + (((uint64_t)(self->private_impl.f_components_workbuf_widths[2])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[2]))));
-    self->private_impl.f_components_workbuf_offsets[4] = (self->private_impl.f_components_workbuf_offsets[3] + (((uint64_t)(self->private_impl.f_components_workbuf_widths[3])) * ((uint64_t)(self->private_impl.f_components_workbuf_heights[3]))));
+    self->private_impl.f_components_workbuf_offsets[1] = (self->private_impl.f_components_workbuf_offsets[0] + v_wh0);
+    self->private_impl.f_components_workbuf_offsets[2] = (self->private_impl.f_components_workbuf_offsets[1] + v_wh1);
+    self->private_impl.f_components_workbuf_offsets[3] = (self->private_impl.f_components_workbuf_offsets[2] + v_wh2);
+    self->private_impl.f_components_workbuf_offsets[4] = (self->private_impl.f_components_workbuf_offsets[3] + v_wh3);
 
     goto ok;
     ok:
@@ -39614,55 +39623,92 @@
           goto exit;
         }
       }
-      if ( ! self->private_impl.f_seen_dht[(0 | self->private_impl.f_scan_comps_td[v_i])] ||  ! self->private_impl.f_seen_dht[(4 | self->private_impl.f_scan_comps_ta[v_i])]) {
+      v_i += 1;
+    }
+    if (self->private_impl.f_sof_marker < 194) {
+      self->private_data.s_prepare_scan[0].scratch = 3;
+      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4);
+      if (self->private_data.s_prepare_scan[0].scratch > ((uint64_t)(io2_a_src - iop_a_src))) {
+        self->private_data.s_prepare_scan[0].scratch -= ((uint64_t)(io2_a_src - iop_a_src));
+        iop_a_src = io2_a_src;
+        status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+        goto suspend;
+      }
+      iop_a_src += self->private_data.s_prepare_scan[0].scratch;
+      self->private_impl.f_scan_ss = 0;
+      self->private_impl.f_scan_se = 63;
+      self->private_impl.f_scan_ah = 0;
+      self->private_impl.f_scan_al = 0;
+    } else {
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(5);
+        if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+          status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+          goto suspend;
+        }
+        uint8_t t_3 = *iop_a_src++;
+        v_c = t_3;
+      }
+      if (v_c > 63) {
+        status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
+        goto exit;
+      }
+      self->private_impl.f_scan_ss = v_c;
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(6);
+        if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+          status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+          goto suspend;
+        }
+        uint8_t t_4 = *iop_a_src++;
+        v_c = t_4;
+      }
+      if ((v_c > 63) || (v_c < self->private_impl.f_scan_ss)) {
+        status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
+        goto exit;
+      }
+      self->private_impl.f_scan_se = v_c;
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7);
+        if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+          status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+          goto suspend;
+        }
+        uint8_t t_5 = *iop_a_src++;
+        v_c = t_5;
+      }
+      if (((v_c >> 4) > 14) || ((v_c & 15) > 13)) {
+        status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
+        goto exit;
+      }
+      self->private_impl.f_scan_ah = (v_c >> 4);
+      self->private_impl.f_scan_al = (v_c & 15);
+      if (self->private_impl.f_scan_ah > 0) {
+        if ((self->private_impl.f_scan_ah - 1) != self->private_impl.f_scan_al) {
+          status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
+          goto exit;
+        }
+      }
+      if (self->private_impl.f_scan_ss == 0) {
+        if (self->private_impl.f_scan_se != 0) {
+          status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
+          goto exit;
+        }
+      } else {
+        if (self->private_impl.f_scan_num_components != 1) {
+          status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
+          goto exit;
+        }
+      }
+    }
+    v_i = 0;
+    while (v_i < self->private_impl.f_scan_num_components) {
+      if (((self->private_impl.f_scan_ss == 0) &&  ! self->private_impl.f_seen_dht[(0 | self->private_impl.f_scan_comps_td[v_i])]) || ((self->private_impl.f_scan_se != 0) &&  ! self->private_impl.f_seen_dht[(4 | self->private_impl.f_scan_comps_ta[v_i])])) {
         status = wuffs_base__make_status(wuffs_jpeg__error__missing_huffman_table);
         goto exit;
       }
       v_i += 1;
     }
-    {
-      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4);
-      if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
-        status = wuffs_base__make_status(wuffs_base__suspension__short_read);
-        goto suspend;
-      }
-      uint8_t t_3 = *iop_a_src++;
-      v_c = t_3;
-    }
-    if (v_c > 63) {
-      status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
-      goto exit;
-    }
-    self->private_impl.f_scan_ss = v_c;
-    {
-      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(5);
-      if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
-        status = wuffs_base__make_status(wuffs_base__suspension__short_read);
-        goto suspend;
-      }
-      uint8_t t_4 = *iop_a_src++;
-      v_c = t_4;
-    }
-    if ((v_c > 63) || (v_c < self->private_impl.f_scan_ss)) {
-      status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
-      goto exit;
-    }
-    self->private_impl.f_scan_se = v_c;
-    {
-      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(6);
-      if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
-        status = wuffs_base__make_status(wuffs_base__suspension__short_read);
-        goto suspend;
-      }
-      uint8_t t_5 = *iop_a_src++;
-      v_c = t_5;
-    }
-    if (((v_c >> 4) > 13) || ((v_c & 15) > 13)) {
-      status = wuffs_base__make_status(wuffs_jpeg__error__bad_sos_marker);
-      goto exit;
-    }
-    self->private_impl.f_scan_ah = (v_c >> 4);
-    self->private_impl.f_scan_al = (v_c & 15);
     self->private_impl.f_scan_width_in_mcus = self->private_impl.f_width_in_mcus;
     self->private_impl.f_scan_height_in_mcus = self->private_impl.f_height_in_mcus;
     if (self->private_impl.f_scan_num_components == 1) {
diff --git a/std/jpeg/decode_jpeg.wuffs b/std/jpeg/decode_jpeg.wuffs
index c5b5953..d776ad4 100644
--- a/std/jpeg/decode_jpeg.wuffs
+++ b/std/jpeg/decode_jpeg.wuffs
@@ -91,9 +91,65 @@
         scan_comps_td        : array[4] base.u8[..= 3],
         scan_comps_ta        : array[4] base.u8[..= 3],
 
+        // Every JPEG has a single SOI (Start Of Image) marker. Hierarchical
+        // JPEG images can have multiple Frames but this decoder does not
+        // support those. We thus expect a single SOF (Start Of Frame) for the
+        // single SOI.
+        //
+        // Sequential JPEGs have a single SOS (Start Of Scan) for the SOF.
+        // Their (Ss, Se, Ah, Al) below are effectively (0, 63, 0, 0).
+        //
+        // Progressive JPEGs have multiple Scans, incrementally building the 64
+        // coefficients (1 DC coefficient and 63 AC coefficients) of each
+        // pre-IDCT block. There are two independent aspects of progression.
+        //
+        // Spectral selection (the Ss and Se parameters) is when a subset of the
+        // 64 coefficients is in the scan. For example, three scans could hold
+        // coefficient 0 (the DC coefficient), coefficients 1 ..= 5 and then
+        // coefficients 6 ..= 63, in zig-zag order.
+        //
+        // Successive approximation (the Ah and Al parameters) is when a subset
+        // of the spectral-selected coefficients' 8 bits are in the scan
+        // (assuming 8-bit and not 12-bit or 16-bit JPEGs). For example, three
+        // scans could hold the 6 most significant bits, the second-least
+        // significant bit and then the least significant bit. A zero Ah value
+        // is a special case, meaning the scan holds (8 - Al) bits.
+        //
+        // The test/data/peacock.progressive.jpeg file has 10 scans:
+        // 1.   Ss = 0   Se =  0   Ah = 0   Al = 1   3 components (Y, Cb, Cr)
+        // 2.   Ss = 1   Se =  5   Ah = 0   Al = 2   1 component  (Y)
+        // 3.   Ss = 1   Se = 63   Ah = 0   Al = 1   1 component  (Cr)
+        // 4.   Ss = 1   Se = 63   Ah = 0   Al = 1   1 component  (Cb)
+        // 5.   Ss = 6   Se = 63   Ah = 0   Al = 2   1 component  (Y)
+        // 6.   Ss = 1   Se = 63   Ah = 2   Al = 1   1 component  (Y)
+        // 7.   Ss = 0   Se =  0   Ah = 1   Al = 0   3 components (Y, Cb, Cr)
+        // 8.   Ss = 1   Se = 63   Ah = 1   Al = 0   1 component  (Cr)
+        // 9.   Ss = 1   Se = 63   Ah = 1   Al = 0   1 component  (Cb)
+        // 10.  Ss = 1   Se = 63   Ah = 1   Al = 0   1 component  (Y)
+        //
+        // The first scan holds the high 7 bits of all components' DC
+        // coefficients.
+        //
+        // The second scan holds the high 6 bits of the 1 ..= 5 AC coefficients
+        // of the Y component.
+        //
+        // The third and fourth scans give the high 7 bits of all of the AC
+        // coefficients of the Cr and Cb components.
+        //
+        // The fifth scan is like the second scan, but for the Y component's
+        // remaining AC coefficients.
+        //
+        // The sixth scan holds the second-least significant bit for the Y
+        // component's AC coefficients.
+        //
+        // The seventh scan holds the least significant bit for all components'
+        // DC coefficients.
+        //
+        // The last three scans hold the least significant bit for each
+        // component's AC coefficients.
         scan_ss : base.u8[..= 63],
         scan_se : base.u8[..= 63],
-        scan_ah : base.u8[..= 13],
+        scan_ah : base.u8[..= 14],
         scan_al : base.u8[..= 13],
 
         scan_width_in_mcus  : base.u32[..= 0x2000],
@@ -390,6 +446,11 @@
 
     var upper_bound : base.u32[..= 0x1_0008]
 
+    var wh0 : base.u64[..= 0x1_0010_0040]  // 0x1_0008 * 0x1_0008.
+    var wh1 : base.u64[..= 0x1_0010_0040]  // 0x1_0008 * 0x1_0008.
+    var wh2 : base.u64[..= 0x1_0010_0040]  // 0x1_0008 * 0x1_0008.
+    var wh3 : base.u64[..= 0x1_0010_0040]  // 0x1_0008 * 0x1_0008.
+
     if this.payload_length < 6 {
         return "#bad SOF marker"
     }
@@ -534,15 +595,16 @@
     this.components_workbuf_heights[3] = upper_bound.min(no_more_than:
             8 * this.height_in_mcus * (this.components_v[3] as base.u32))
 
+    wh0 = (this.components_workbuf_widths[0] as base.u64) * (this.components_workbuf_heights[0] as base.u64)
+    wh1 = (this.components_workbuf_widths[1] as base.u64) * (this.components_workbuf_heights[1] as base.u64)
+    wh2 = (this.components_workbuf_widths[2] as base.u64) * (this.components_workbuf_heights[2] as base.u64)
+    wh3 = (this.components_workbuf_widths[3] as base.u64) * (this.components_workbuf_heights[3] as base.u64)
+
     this.components_workbuf_offsets[0] = 0
-    this.components_workbuf_offsets[1] = this.components_workbuf_offsets[0] +
-            ((this.components_workbuf_widths[0] as base.u64) * (this.components_workbuf_heights[0] as base.u64))
-    this.components_workbuf_offsets[2] = this.components_workbuf_offsets[1] +
-            ((this.components_workbuf_widths[1] as base.u64) * (this.components_workbuf_heights[1] as base.u64))
-    this.components_workbuf_offsets[3] = this.components_workbuf_offsets[2] +
-            ((this.components_workbuf_widths[2] as base.u64) * (this.components_workbuf_heights[2] as base.u64))
-    this.components_workbuf_offsets[4] = this.components_workbuf_offsets[3] +
-            ((this.components_workbuf_widths[3] as base.u64) * (this.components_workbuf_heights[3] as base.u64))
+    this.components_workbuf_offsets[1] = this.components_workbuf_offsets[0] + wh0
+    this.components_workbuf_offsets[2] = this.components_workbuf_offsets[1] + wh1
+    this.components_workbuf_offsets[3] = this.components_workbuf_offsets[2] + wh2
+    this.components_workbuf_offsets[4] = this.components_workbuf_offsets[3] + wh3
 }
 
 pub func decoder.decode_frame_config?(dst: nptr base.frame_config, src: base.io_reader) {
@@ -1147,8 +1209,62 @@
             }
         }
 
-        if (not this.seen_dht[0 | this.scan_comps_td[i]]) or
-                (not this.seen_dht[4 | this.scan_comps_ta[i]]) {
+        i += 1
+    } endwhile
+
+    if this.sof_marker < 0xC2 {
+        // At this point, for sequential (not progressive) JPEGs, we should
+        // check that (Ss, Se, Ah, Al) is (0, 63, 0, 0) but libjpeg treats
+        // otherwise as a warning (JWRN_NOT_SEQUENTIAL), not an error.
+        args.src.skip_u32?(n: 3)
+        this.scan_ss = 0
+        this.scan_se = 63
+        this.scan_ah = 0
+        this.scan_al = 0
+
+    } else {
+        c = args.src.read_u8?()
+        if c > 63 {
+            return "#bad SOS marker"
+        }
+        this.scan_ss = c
+
+        c = args.src.read_u8?()
+        if (c > 63) or (c < this.scan_ss) {
+            return "#bad SOS marker"
+        }
+        this.scan_se = c
+
+        c = args.src.read_u8?()
+        if ((c >> 4) > 14) or ((c & 0x0F) > 13) {
+            return "#bad SOS marker"
+        }
+        this.scan_ah = c >> 4
+        this.scan_al = c & 0x0F
+        if this.scan_ah > 0 {
+            if (this.scan_ah - 1) <> this.scan_al {
+                return "#bad SOS marker"
+            }
+        }
+
+        if this.scan_ss == 0 {
+            // Progressive DC scans can have only one spectral element.
+            if this.scan_se <> 0 {
+                return "#bad SOS marker"
+            }
+        } else {
+            // Progressive AC scans can have only one component.
+            if this.scan_num_components <> 1 {
+                return "#bad SOS marker"
+            }
+        }
+    }
+
+    i = 0
+    while i < this.scan_num_components {
+        assert i < 4 via "a < b: a < c; c <= b"(c: this.scan_num_components)
+        if ((this.scan_ss == 0) and (not this.seen_dht[0 | this.scan_comps_td[i]])) or
+                ((this.scan_se <> 0) and (not this.seen_dht[4 | this.scan_comps_ta[i]])) {
             // TODO: while not required by the spec, we could fall back to
             // implicit tables? See section K.3 "Typical Huffman tables for
             // 8-bit precision luminance and chrominance" and
@@ -1159,29 +1275,6 @@
         i += 1
     } endwhile
 
-    c = args.src.read_u8?()
-    if c > 63 {
-        return "#bad SOS marker"
-    }
-    this.scan_ss = c
-
-    c = args.src.read_u8?()
-    if (c > 63) or (c < this.scan_ss) {
-        return "#bad SOS marker"
-    }
-    this.scan_se = c
-
-    c = args.src.read_u8?()
-    if ((c >> 4) > 13) or ((c & 0x0F) > 13) {
-        return "#bad SOS marker"
-    }
-    this.scan_ah = c >> 4
-    this.scan_al = c & 0x0F
-
-    // At this point, for sequential (not progressive) JPEGs, we should check
-    // that (Ss, Se, Ah, Al) is (0, 63, 0, 0) but libjpeg treats otherwise as a
-    // warning (JWRN_NOT_SEQUENTIAL), not an error.
-
     this.scan_width_in_mcus = this.width_in_mcus
     this.scan_height_in_mcus = this.height_in_mcus
 
diff --git a/test/nia-checksums-of-data.txt b/test/nia-checksums-of-data.txt
index 793d25b..ea00b59 100644
--- a/test/nia-checksums-of-data.txt
+++ b/test/nia-checksums-of-data.txt
@@ -85,7 +85,7 @@
 d49bab2d test/data/peacock.grayscale.jpeg
 d42a13cd test/data/peacock.optimize.jpeg
 bdf8cc70 test/data/peacock.ppm
-9154cd54 test/data/peacock.progressive.jpeg
+db9b1efa test/data/peacock.progressive.jpeg
 951676cc test/data/peacock.q30.jpeg
 56835223 test/data/peacock.q99.jpeg
 1543f677 test/data/peacock.s-weird.jpeg