floatconv: tweak Simple Decimal Conversion

This tweaks the code and comments added in commit 36d7748e "Merge pull
request #90 from sarastro-nl/improvement/optimize-SDC" and commit
d46220ca "optimize SDC". Both of those two commit messages are short but
see the discussion re algorithmic changes on PR 87 (yes, 87, not 90):
https://github.com/google/wuffs/pull/87

Also run clang-format.

Also update doc/changelog.md

Compared to before commit d46220ca "optimize SDC", when running
script/manual-test-parse-number-f64.cc and disabling both the
etc__parse_number_f64_eisel_lemire function and the "If both man and
(10 ** exp10) are exactly representable by a double" fast path, the
tests still pass but the number of etc__high_prec_dec__small_lshift and
etc__high_prec_dec__small_rshift calls drop:

Function    Before      After       Ratio
lshift      31174584    25762258    82.64% = 100% - 17.36%
rshift      21592604    20982502    97.17% = 100% -  2.83%

The time taken drops from 15.984s to 13.710s (85.77% = 100% - 14.23%).

To repeat, these are the improvements after disabling the Eisel-Lemire
and other fast paths. Simple Decimal Conversion is the rarely invoked
fallback parse_number_f64 algorithm. The practical improvement (with
Eisel-Lemire etc enabled) is smaller.
diff --git a/doc/changelog.md b/doc/changelog.md
index fbb05c2..462fd06 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -7,6 +7,9 @@
 input"` instead of `"$short read"`. Importantly, this is an error, not a
 suspension.
 
+The `wuffs_base__parse_number_f64` function's Simple Decimal Conversion
+fallback algorithm has been optimized.
+
 
 ## 2023-01-26 version 0.3.0
 
diff --git a/internal/cgen/base/floatconv-submodule-code.c b/internal/cgen/base/floatconv-submodule-code.c
index 33fc546..7691182 100644
--- a/internal/cgen/base/floatconv-submodule-code.c
+++ b/internal/cgen/base/floatconv-submodule-code.c
@@ -1264,6 +1264,27 @@
     // powers converts decimal powers of 10 to binary powers of 2. For example,
     // (10000 >> 13) is 1. It stops before the elements exceed 60, also known
     // as WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL.
+    //
+    // This rounds down (1<<13 is a lower bound for 1e4). Adding 1 to the array
+    // element value rounds up (1<<14 is an upper bound for 1e4) while staying
+    // at or below WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL.
+    //
+    // When starting in the range [1e+1 .. 1e+2] (i.e. h->decimal_point == +2),
+    // powers[2] == 6 and so:
+    //  - Right shifting by 6+0 produces the range [10/64 .. 100/64] =
+    //    [0.156250 .. 1.56250]. The resultant h->decimal_point is +0 or +1.
+    //  - Right shifting by 6+1 produces the range [10/128 .. 100/128] =
+    //    [0.078125 .. 0.78125]. The resultant h->decimal_point is -1 or -0.
+    //
+    // When starting in the range [1e-3 .. 1e-2] (i.e. h->decimal_point == -2),
+    // powers[2] == 6 and so:
+    //  - Left shifting by 6+0 produces the range [0.001*64 .. 0.01*64] =
+    //    [0.064 .. 0.64]. The resultant h->decimal_point is -1 or -0.
+    //  - Left shifting by 6+1 produces the range [0.001*128 .. 0.01*128] =
+    //    [0.128 .. 1.28]. The resultant h->decimal_point is +0 or +1.
+    //
+    // Thus, when targeting h->decimal_point being +0 or +1, use (powers[n]+0)
+    // when right shifting but (powers[n]+1) when left shifting.
     static const uint32_t num_powers = 19;
     static const uint8_t powers[19] = {
         0,  3,  6,  9,  13, 16, 19, 23, 26, 29,  //
@@ -1307,9 +1328,10 @@
     // When Eisel-Lemire fails, fall back to Simple Decimal Conversion. See
     // https://nigeltao.github.io/blog/2020/parse-number-f64-simple.html
     //
-    // Scale by powers of 2 until we're in the range [1 .. 2]. First we shift
-    // right, possibly a little too far, ending with a value certainly below
-    // 10 ...
+    // Scale by powers of 2 until we're in the range [0.1 .. 10]. Equivalently,
+    // that h->decimal_point is +0 or +1.
+    //
+    // First we shift right while at or above 10...
     const int32_t f64_bias = -1023;
     int32_t exp2 = 0;
     while (h->decimal_point > 1) {
@@ -1326,13 +1348,15 @@
       }
       exp2 += (int32_t)shift;
     }
-    // ... then we shift left, putting us in [0.1 .. 10].
+    // ...then we shift left while below 0.1.
     while (h->decimal_point < 0) {
       uint32_t shift;
-        uint32_t n = (uint32_t)(-h->decimal_point);
-        shift = (n < num_powers)
-                    ? powers[n] + 1 // shift 1 extra since we're now multiplying
-                    : WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL;
+      uint32_t n = (uint32_t)(-h->decimal_point);
+      shift = (n < num_powers)
+                  // The +1 is per "when targeting h->decimal_point being +0 or
+                  // +1... when left shifting" in the powers comment above.
+                  ? (powers[n] + 1)
+                  : WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL;
 
       wuffs_base__private_implementation__high_prec_dec__small_lshift(h, shift);
       if (h->decimal_point >
@@ -1342,40 +1366,38 @@
       exp2 -= (int32_t)shift;
     }
 
-    // To get into the range [1 .. 2] we need to look at the first 3 digits of
-    // the mantisse to determine the final left shift. If we are already between 1
-    // and 2 then just shift left by 52.
-    uint64_t man = 0;
-    uint32_t i;
-    for (i = 0; i < 3; i++) {
-        man = (10 * man) + ((i < h->num_digits) ? h->digits[i] : 0);
+    // To get from "in the range [0.1 .. 10]" to "in the range [1 .. 2]" (which
+    // will give us our exponent in base-2), the mantissa's first 3 digits will
+    // determine the final left shift, equal to 52 (the number of explicit f64
+    // bits) plus an additional adjustment.
+    int man3 = (100 * h->digits[0]) +
+               ((h->num_digits > 1) ? (10 * h->digits[1]) : 0) +
+               ((h->num_digits > 2) ? h->digits[2] : 0);
+    int32_t additional_lshift = 0;
+    if (h->decimal_point == 0) {  // The value is in [0.1 .. 1].
+      if (man3 < 125) {
+        additional_lshift = +4;
+      } else if (man3 < 250) {
+        additional_lshift = +3;
+      } else if (man3 < 500) {
+        additional_lshift = +2;
+      } else {
+        additional_lshift = +1;
+      }
+    } else {  // The value is in [1 .. 10].
+      if (man3 < 200) {
+        additional_lshift = -0;
+      } else if (man3 < 400) {
+        additional_lshift = -1;
+      } else if (man3 < 800) {
+        additional_lshift = -2;
+      } else {
+        additional_lshift = -3;
+      }
     }
-    int32_t final_lshift = 52;
-    int32_t additional_shift = 0;
-    if (h->decimal_point == 0) { // value is in [0.1 .. 1]
-        if (man < 125) {
-            additional_shift = -4;
-        } else if (man < 250) {
-            additional_shift = -3;
-        } else if (man < 500) {
-            additional_shift = -2;
-        } else {
-            additional_shift = -1;
-        }
-    } else { // value is in [1 .. 10]
-        if (man < 200) {
-            additional_shift = 0;
-        } else if (man < 400) {
-            additional_shift = 1;
-        } else if (man < 800) {
-            additional_shift = 2;
-        } else {
-            additional_shift = 3;
-        }
-    }
-    exp2 += additional_shift;
-    final_lshift -= additional_shift;
-    
+    exp2 -= additional_lshift;
+    uint32_t final_lshift = (uint32_t)(52 + additional_lshift);
+
     // The minimum normal exponent is (f64_bias + 1).
     while ((f64_bias + 1) > exp2) {
       uint32_t n = (uint32_t)((f64_bias + 1) - exp2);
@@ -1392,7 +1414,8 @@
     }
 
     // Extract 53 bits for the mantissa (in base-2).
-    wuffs_base__private_implementation__high_prec_dec__small_lshift(h, final_lshift);
+    wuffs_base__private_implementation__high_prec_dec__small_lshift(
+        h, final_lshift);
     uint64_t man2 =
         wuffs_base__private_implementation__high_prec_dec__rounded_integer(h);
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 828d6e5..400895e 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -14781,6 +14781,27 @@
     // powers converts decimal powers of 10 to binary powers of 2. For example,
     // (10000 >> 13) is 1. It stops before the elements exceed 60, also known
     // as WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL.
+    //
+    // This rounds down (1<<13 is a lower bound for 1e4). Adding 1 to the array
+    // element value rounds up (1<<14 is an upper bound for 1e4) while staying
+    // at or below WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL.
+    //
+    // When starting in the range [1e+1 .. 1e+2] (i.e. h->decimal_point == +2),
+    // powers[2] == 6 and so:
+    //  - Right shifting by 6+0 produces the range [10/64 .. 100/64] =
+    //    [0.156250 .. 1.56250]. The resultant h->decimal_point is +0 or +1.
+    //  - Right shifting by 6+1 produces the range [10/128 .. 100/128] =
+    //    [0.078125 .. 0.78125]. The resultant h->decimal_point is -1 or -0.
+    //
+    // When starting in the range [1e-3 .. 1e-2] (i.e. h->decimal_point == -2),
+    // powers[2] == 6 and so:
+    //  - Left shifting by 6+0 produces the range [0.001*64 .. 0.01*64] =
+    //    [0.064 .. 0.64]. The resultant h->decimal_point is -1 or -0.
+    //  - Left shifting by 6+1 produces the range [0.001*128 .. 0.01*128] =
+    //    [0.128 .. 1.28]. The resultant h->decimal_point is +0 or +1.
+    //
+    // Thus, when targeting h->decimal_point being +0 or +1, use (powers[n]+0)
+    // when right shifting but (powers[n]+1) when left shifting.
     static const uint32_t num_powers = 19;
     static const uint8_t powers[19] = {
         0,  3,  6,  9,  13, 16, 19, 23, 26, 29,  //
@@ -14824,9 +14845,10 @@
     // When Eisel-Lemire fails, fall back to Simple Decimal Conversion. See
     // https://nigeltao.github.io/blog/2020/parse-number-f64-simple.html
     //
-    // Scale by powers of 2 until we're in the range [1 .. 2]. First we shift
-    // right, possibly a little too far, ending with a value certainly below
-    // 10 ...
+    // Scale by powers of 2 until we're in the range [0.1 .. 10]. Equivalently,
+    // that h->decimal_point is +0 or +1.
+    //
+    // First we shift right while at or above 10...
     const int32_t f64_bias = -1023;
     int32_t exp2 = 0;
     while (h->decimal_point > 1) {
@@ -14843,13 +14865,15 @@
       }
       exp2 += (int32_t)shift;
     }
-    // ... then we shift left, putting us in [0.1 .. 10].
+    // ...then we shift left while below 0.1.
     while (h->decimal_point < 0) {
       uint32_t shift;
-        uint32_t n = (uint32_t)(-h->decimal_point);
-        shift = (n < num_powers)
-                    ? powers[n] + 1 // shift 1 extra since we're now multiplying
-                    : WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL;
+      uint32_t n = (uint32_t)(-h->decimal_point);
+      shift = (n < num_powers)
+                  // The +1 is per "when targeting h->decimal_point being +0 or
+                  // +1... when left shifting" in the powers comment above.
+                  ? (powers[n] + 1)
+                  : WUFFS_BASE__PRIVATE_IMPLEMENTATION__HPD__SHIFT__MAX_INCL;
 
       wuffs_base__private_implementation__high_prec_dec__small_lshift(h, shift);
       if (h->decimal_point >
@@ -14859,40 +14883,38 @@
       exp2 -= (int32_t)shift;
     }
 
-    // To get into the range [1 .. 2] we need to look at the first 3 digits of
-    // the mantisse to determine the final left shift. If we are already between 1
-    // and 2 then just shift left by 52.
-    uint64_t man = 0;
-    uint32_t i;
-    for (i = 0; i < 3; i++) {
-        man = (10 * man) + ((i < h->num_digits) ? h->digits[i] : 0);
+    // To get from "in the range [0.1 .. 10]" to "in the range [1 .. 2]" (which
+    // will give us our exponent in base-2), the mantissa's first 3 digits will
+    // determine the final left shift, equal to 52 (the number of explicit f64
+    // bits) plus an additional adjustment.
+    int man3 = (100 * h->digits[0]) +
+               ((h->num_digits > 1) ? (10 * h->digits[1]) : 0) +
+               ((h->num_digits > 2) ? h->digits[2] : 0);
+    int32_t additional_lshift = 0;
+    if (h->decimal_point == 0) {  // The value is in [0.1 .. 1].
+      if (man3 < 125) {
+        additional_lshift = +4;
+      } else if (man3 < 250) {
+        additional_lshift = +3;
+      } else if (man3 < 500) {
+        additional_lshift = +2;
+      } else {
+        additional_lshift = +1;
+      }
+    } else {  // The value is in [1 .. 10].
+      if (man3 < 200) {
+        additional_lshift = -0;
+      } else if (man3 < 400) {
+        additional_lshift = -1;
+      } else if (man3 < 800) {
+        additional_lshift = -2;
+      } else {
+        additional_lshift = -3;
+      }
     }
-    int32_t final_lshift = 52;
-    int32_t additional_shift = 0;
-    if (h->decimal_point == 0) { // value is in [0.1 .. 1]
-        if (man < 125) {
-            additional_shift = -4;
-        } else if (man < 250) {
-            additional_shift = -3;
-        } else if (man < 500) {
-            additional_shift = -2;
-        } else {
-            additional_shift = -1;
-        }
-    } else { // value is in [1 .. 10]
-        if (man < 200) {
-            additional_shift = 0;
-        } else if (man < 400) {
-            additional_shift = 1;
-        } else if (man < 800) {
-            additional_shift = 2;
-        } else {
-            additional_shift = 3;
-        }
-    }
-    exp2 += additional_shift;
-    final_lshift -= additional_shift;
-    
+    exp2 -= additional_lshift;
+    uint32_t final_lshift = (uint32_t)(52 + additional_lshift);
+
     // The minimum normal exponent is (f64_bias + 1).
     while ((f64_bias + 1) > exp2) {
       uint32_t n = (uint32_t)((f64_bias + 1) - exp2);
@@ -14909,7 +14931,8 @@
     }
 
     // Extract 53 bits for the mantissa (in base-2).
-    wuffs_base__private_implementation__high_prec_dec__small_lshift(h, final_lshift);
+    wuffs_base__private_implementation__high_prec_dec__small_lshift(
+        h, final_lshift);
     uint64_t man2 =
         wuffs_base__private_implementation__high_prec_dec__rounded_integer(h);