std/lzma: simplify decode-a-literal

name                            old speed      new speed      delta
wuffs_lzma_decode_100k/clang14  54.8MB/s ± 1%  56.2MB/s ± 0%  +2.61%  (p=0.008 n=5+5)
wuffs_lzma_decode_100k/gcc12    56.8MB/s ± 0%  56.4MB/s ± 1%  -0.75%  (p=0.008 n=5+5)

$ time example-mzcat < linux-6.8.2.tar.xz > /dev/null
Before: user 0m8.409s
After:  user 0m8.396s
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b864346..5ae002d 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -52123,67 +52123,40 @@
         v_range <<= 8u;
       }
       v_index_lit = (15u & ((((uint32_t)((v_pos & v_lp_mask))) << v_lc) | (((uint32_t)(v_prev_byte)) >> (8u - v_lc))));
+      v_lanl_offset = 0u;
       if (v_state >= 7u) {
         v_lanl_offset = 256u;
-        v_tree_node = 1u;
-        while (v_tree_node < 256u) {
-          v_match_byte <<= 1u;
-          v_lanl_old_offset = v_lanl_offset;
-          v_lanl_offset &= v_match_byte;
-          v_lanl_index = (v_lanl_offset + v_lanl_old_offset + v_tree_node);
-          v_prob = ((uint32_t)(self->private_data.f_probs_lit[v_index_lit][v_lanl_index]));
-          v_threshold = ((uint32_t)((v_range >> 11u) * v_prob));
-          if (v_bits < v_threshold) {
-            v_lanl_offset = ((v_lanl_offset ^ v_lanl_old_offset) & 256u);
-            v_range = v_threshold;
-            v_prob += (((uint32_t)(2048u - v_prob)) >> 5u);
-            self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
-            v_tree_node = (v_tree_node << 1u);
-          } else {
-            v_bits -= v_threshold;
-            v_range -= v_threshold;
-            v_prob -= (v_prob >> 5u);
-            self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
-            v_tree_node = ((v_tree_node << 1u) | 1u);
-          }
-          if ((v_range >> 24u) == 0u) {
-            if (((uint64_t)(io2_a_src - iop_a_src)) <= 0u) {
-              status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_i_o);
-              goto exit;
-            }
-            v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
-            iop_a_src += 1u;
-            v_bits = (((uint32_t)(v_bits << 8u)) | ((uint32_t)(v_c8)));
-            v_range <<= 8u;
-          }
+      }
+      v_tree_node = 1u;
+      while (v_tree_node < 256u) {
+        v_match_byte <<= 1u;
+        v_lanl_old_offset = v_lanl_offset;
+        v_lanl_offset &= v_match_byte;
+        v_lanl_index = (v_lanl_offset + v_lanl_old_offset + v_tree_node);
+        v_prob = ((uint32_t)(self->private_data.f_probs_lit[v_index_lit][v_lanl_index]));
+        v_threshold = ((uint32_t)((v_range >> 11u) * v_prob));
+        if (v_bits < v_threshold) {
+          v_lanl_offset = ((v_lanl_offset ^ v_lanl_old_offset) & 256u);
+          v_range = v_threshold;
+          v_prob += (((uint32_t)(2048u - v_prob)) >> 5u);
+          self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
+          v_tree_node = (v_tree_node << 1u);
+        } else {
+          v_bits -= v_threshold;
+          v_range -= v_threshold;
+          v_prob -= (v_prob >> 5u);
+          self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
+          v_tree_node = ((v_tree_node << 1u) | 1u);
         }
-      } else {
-        v_tree_node = 1u;
-        while (v_tree_node < 256u) {
-          v_prob = ((uint32_t)(self->private_data.f_probs_lit[v_index_lit][v_tree_node]));
-          v_threshold = ((uint32_t)((v_range >> 11u) * v_prob));
-          if (v_bits < v_threshold) {
-            v_range = v_threshold;
-            v_prob += (((uint32_t)(2048u - v_prob)) >> 5u);
-            self->private_data.f_probs_lit[v_index_lit][v_tree_node] = ((uint16_t)(v_prob));
-            v_tree_node = (v_tree_node << 1u);
-          } else {
-            v_bits -= v_threshold;
-            v_range -= v_threshold;
-            v_prob -= (v_prob >> 5u);
-            self->private_data.f_probs_lit[v_index_lit][v_tree_node] = ((uint16_t)(v_prob));
-            v_tree_node = ((v_tree_node << 1u) | 1u);
+        if ((v_range >> 24u) == 0u) {
+          if (((uint64_t)(io2_a_src - iop_a_src)) <= 0u) {
+            status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_i_o);
+            goto exit;
           }
-          if ((v_range >> 24u) == 0u) {
-            if (((uint64_t)(io2_a_src - iop_a_src)) <= 0u) {
-              status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_i_o);
-              goto exit;
-            }
-            v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
-            iop_a_src += 1u;
-            v_bits = (((uint32_t)(v_bits << 8u)) | ((uint32_t)(v_c8)));
-            v_range <<= 8u;
-          }
+          v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
+          iop_a_src += 1u;
+          v_bits = (((uint32_t)(v_bits << 8u)) | ((uint32_t)(v_c8)));
+          v_range <<= 8u;
         }
       }
       v_prev_byte = ((uint8_t)(v_tree_node));
diff --git a/std/lzma/decode_bitstream_fast.wuffs b/std/lzma/decode_bitstream_fast.wuffs
index 05053da..5f2d213 100644
--- a/std/lzma/decode_bitstream_fast.wuffs
+++ b/std/lzma/decode_bitstream_fast.wuffs
@@ -130,102 +130,46 @@
                     (((pos & lp_mask) as base.u32) << lc) |
                     ((prev_byte as base.u32) >> (8 - lc)))
 
+            lanl_offset = 0
             if state >= 7 {
-                // We are decoding an 8-bit LITERAL, and (state >= 7) means
-                // that we most recently decoded a NON-LITERAL. The
-                // this.probs_lit[index_lit] array has 0x300 elements: three
-                // banks of 0x100.
-                //
-                // For (state < 7), we only use the first bank (and we run the
-                // while.low_state loop instead). This is equivalent to running
-                // the while.high_state loop where lanl_offset starts at and
-                // stays at zero for the entire loop. match_byte is ignored.
-                //
-                // For (state >= 7), we start by using the two later banks, but
-                // as we are decoding those 8 bits (from MSB to LSB order), we
-                // drop down to the first bank (and stay there) if and when the
-                // decoded bit doesn't equal the corresponding bit (in the same
-                // MSB to LSB order) in the match_byte variable.
-                //
-                // lanl_offset stays at 0x100 until that "if and when" point
-                // (which typically happens; if all 8 bits were equal then the
-                // previous NON-LITERAL copy could often just have been
-                // longer), when lanl_offset then drops down to zero. We then
-                // effectively run the remaining (8 - I) iterations in the
-                // while.low_state loop, where I is the iteration count.
-                //
-                // Whether we use the 0x100 or 0x200 bank, out of the two later
-                // banks, depends on the match_byte's I'th bit (MSB to LSB).
-                // match_byte is conceptually a base.u8 (it is <= 0xFF outside
-                // of this code fragment) but the intermediate computation is
-                // simpler as a base.u32 (ignoring the high 23 bits).
                 lanl_offset = 0x100
-
-                tree_node = 1
-                while.high_state tree_node < 0x100,
-                        inv args.dst.length() >= 282,
-                {
-                    match_byte ~mod<<= 1
-                    lanl_old_offset = lanl_offset
-                    lanl_offset &= match_byte
-                    lanl_index = lanl_offset + lanl_old_offset + tree_node
-
-                    prob = this.probs_lit[index_lit][lanl_index] as base.u32
-                    threshold = (range >> 11) ~mod* prob
-                    if bits < threshold {
-                        lanl_offset = (lanl_offset ^ lanl_old_offset) & 0x100
-                        range = threshold
-                        prob ~mod+= (2048 ~mod- prob) >> 5
-                        this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
-                        tree_node = (tree_node << 1)
-                    } else {
-                        bits ~mod-= threshold
-                        range ~mod-= threshold
-                        prob ~mod-= prob >> 5
-                        this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
-                        tree_node = (tree_node << 1) | 1
-                    }
-                    if (range >> 24) == 0 {
-                        if args.src.length() <= 0 {
-                            return "#internal error: inconsistent I/O"
-                        }
-                        c8 = args.src.peek_u8()
-                        args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-                        bits = (bits ~mod<< 8) | (c8 as base.u32)
-                        range ~mod<<= 8
-                    }
-                } endwhile.high_state
-
-            } else {
-                tree_node = 1
-                while.low_state tree_node < 0x100,
-                        inv args.dst.length() >= 282,
-                {
-                    prob = this.probs_lit[index_lit][tree_node] as base.u32
-                    threshold = (range >> 11) ~mod* prob
-                    if bits < threshold {
-                        range = threshold
-                        prob ~mod+= (2048 ~mod- prob) >> 5
-                        this.probs_lit[index_lit][tree_node] = (prob & 0xFFFF) as base.u16
-                        tree_node = (tree_node << 1)
-                    } else {
-                        bits ~mod-= threshold
-                        range ~mod-= threshold
-                        prob ~mod-= prob >> 5
-                        this.probs_lit[index_lit][tree_node] = (prob & 0xFFFF) as base.u16
-                        tree_node = (tree_node << 1) | 1
-                    }
-                    if (range >> 24) == 0 {
-                        if args.src.length() <= 0 {
-                            return "#internal error: inconsistent I/O"
-                        }
-                        c8 = args.src.peek_u8()
-                        args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-                        bits = (bits ~mod<< 8) | (c8 as base.u32)
-                        range ~mod<<= 8
-                    }
-                } endwhile.low_state
             }
+
+            tree_node = 1
+            while tree_node < 0x100,
+                    inv args.dst.length() >= 282,
+            {
+                match_byte ~mod<<= 1
+                lanl_old_offset = lanl_offset
+                lanl_offset &= match_byte
+                lanl_index = lanl_offset + lanl_old_offset + tree_node
+
+                prob = this.probs_lit[index_lit][lanl_index] as base.u32
+                threshold = (range >> 11) ~mod* prob
+                if bits < threshold {
+                    lanl_offset = (lanl_offset ^ lanl_old_offset) & 0x100
+                    range = threshold
+                    prob ~mod+= (2048 ~mod- prob) >> 5
+                    this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
+                    tree_node = (tree_node << 1)
+                } else {
+                    bits ~mod-= threshold
+                    range ~mod-= threshold
+                    prob ~mod-= prob >> 5
+                    this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
+                    tree_node = (tree_node << 1) | 1
+                }
+                if (range >> 24) == 0 {
+                    if args.src.length() <= 0 {
+                        return "#internal error: inconsistent I/O"
+                    }
+                    c8 = args.src.peek_u8()
+                    args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+                    bits = (bits ~mod<< 8) | (c8 as base.u32)
+                    range ~mod<<= 8
+                }
+            } endwhile
+
             assert args.dst.length() >= 282
 
             // AlgOve03  emitLiteral(literal)