Make Brotli decompression faster

Makes it ~8% faster on my skylake desktop.

PiperOrigin-RevId: 689499172
diff --git a/c/dec/decode.c b/c/dec/decode.c
index 3ed9fe8..921aa4a 100644
--- a/c/dec/decode.c
+++ b/c/dec/decode.c
@@ -2006,35 +2006,72 @@
     brotli_reg_t bits;
     brotli_reg_t value;
     PreloadSymbol(safe, s->literal_htree, br, &bits, &value);
-    do {
-      if (!CheckInputAmount(safe, br)) {
-        s->state = BROTLI_STATE_COMMAND_INNER;
-        result = BROTLI_DECODER_NEEDS_MORE_INPUT;
-        goto saveStateAndReturn;
+    if (!safe) {
+      // This is a hottest part of the decode, so we copy the loop below
+      // and optimize it by calculating the number of steps where all checks
+      // evaluate to false (ringbuffer size/block size/input size).
+      // Since all checks are loop invariant, we just need to find
+      // minimal number of iterations for a simple loop, and run
+      // the full version for the remainder.
+      int num_steps = i - 1;
+      if (num_steps > 0 && ((brotli_reg_t)(num_steps) > s->block_length[0])) {
+        // Safe cast, since block_length < steps
+        num_steps = (int)s->block_length[0];
       }
-      if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
-        goto NextLiteralBlock;
+      if (s->ringbuffer_size >= pos &&
+          (s->ringbuffer_size - pos) <= num_steps) {
+        num_steps = s->ringbuffer_size - pos - 1;
       }
-      if (!safe) {
+      if (num_steps < 0) {
+        num_steps = 0;
+      }
+      num_steps = BrotliCopyPreloadedSymbolsToU8(s->literal_htree, br, &bits,
+                                                 &value, s->ringbuffer, pos,
+                                                 num_steps);
+      pos += num_steps;
+      s->block_length[0] -= (brotli_reg_t)num_steps;
+      i -= num_steps;
+      do {
+        if (!CheckInputAmount(safe, br)) {
+          s->state = BROTLI_STATE_COMMAND_INNER;
+          result = BROTLI_DECODER_NEEDS_MORE_INPUT;
+          goto saveStateAndReturn;
+        }
+        if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
+          goto NextLiteralBlock;
+        }
         BrotliCopyPreloadedSymbolsToU8(s->literal_htree, br, &bits, &value,
                                        s->ringbuffer, pos, 1);
-      } else {
+        --s->block_length[0];
+        BROTLI_LOG_ARRAY_INDEX(s->ringbuffer, pos);
+        ++pos;
+        if (BROTLI_PREDICT_FALSE(pos == s->ringbuffer_size)) {
+          s->state = BROTLI_STATE_COMMAND_INNER_WRITE;
+          --i;
+          goto saveStateAndReturn;
+        }
+      } while (--i != 0);
+    } else { /* safe */
+      do {
+        if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
+          goto NextLiteralBlock;
+        }
         brotli_reg_t literal;
         if (!SafeReadSymbol(s->literal_htree, br, &literal)) {
           result = BROTLI_DECODER_NEEDS_MORE_INPUT;
           goto saveStateAndReturn;
         }
         s->ringbuffer[pos] = (uint8_t)literal;
-      }
-      --s->block_length[0];
-      BROTLI_LOG_ARRAY_INDEX(s->ringbuffer, pos);
-      ++pos;
-      if (BROTLI_PREDICT_FALSE(pos == s->ringbuffer_size)) {
-        s->state = BROTLI_STATE_COMMAND_INNER_WRITE;
-        --i;
-        goto saveStateAndReturn;
-      }
-    } while (--i != 0);
+        --s->block_length[0];
+        BROTLI_LOG_ARRAY_INDEX(s->ringbuffer, pos);
+        ++pos;
+        if (BROTLI_PREDICT_FALSE(pos == s->ringbuffer_size)) {
+          s->state = BROTLI_STATE_COMMAND_INNER_WRITE;
+          --i;
+          goto saveStateAndReturn;
+        }
+      } while (--i != 0);
+    }
   } else {
     uint8_t p1 = s->ringbuffer[(pos - 1) & s->ringbuffer_mask];
     uint8_t p2 = s->ringbuffer[(pos - 2) & s->ringbuffer_mask];