Make Brotli decompression faster
Makes it ~8% faster on my skylake desktop.
PiperOrigin-RevId: 689499172
diff --git a/c/dec/decode.c b/c/dec/decode.c
index 3ed9fe8..921aa4a 100644
--- a/c/dec/decode.c
+++ b/c/dec/decode.c
@@ -2006,35 +2006,72 @@
brotli_reg_t bits;
brotli_reg_t value;
PreloadSymbol(safe, s->literal_htree, br, &bits, &value);
- do {
- if (!CheckInputAmount(safe, br)) {
- s->state = BROTLI_STATE_COMMAND_INNER;
- result = BROTLI_DECODER_NEEDS_MORE_INPUT;
- goto saveStateAndReturn;
+ if (!safe) {
+ // This is a hottest part of the decode, so we copy the loop below
+ // and optimize it by calculating the number of steps where all checks
+ // evaluate to false (ringbuffer size/block size/input size).
+ // Since all checks are loop invariant, we just need to find
+ // minimal number of iterations for a simple loop, and run
+ // the full version for the remainder.
+ int num_steps = i - 1;
+ if (num_steps > 0 && ((brotli_reg_t)(num_steps) > s->block_length[0])) {
+ // Safe cast, since block_length < steps
+ num_steps = (int)s->block_length[0];
}
- if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
- goto NextLiteralBlock;
+ if (s->ringbuffer_size >= pos &&
+ (s->ringbuffer_size - pos) <= num_steps) {
+ num_steps = s->ringbuffer_size - pos - 1;
}
- if (!safe) {
+ if (num_steps < 0) {
+ num_steps = 0;
+ }
+ num_steps = BrotliCopyPreloadedSymbolsToU8(s->literal_htree, br, &bits,
+ &value, s->ringbuffer, pos,
+ num_steps);
+ pos += num_steps;
+ s->block_length[0] -= (brotli_reg_t)num_steps;
+ i -= num_steps;
+ do {
+ if (!CheckInputAmount(safe, br)) {
+ s->state = BROTLI_STATE_COMMAND_INNER;
+ result = BROTLI_DECODER_NEEDS_MORE_INPUT;
+ goto saveStateAndReturn;
+ }
+ if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
+ goto NextLiteralBlock;
+ }
BrotliCopyPreloadedSymbolsToU8(s->literal_htree, br, &bits, &value,
s->ringbuffer, pos, 1);
- } else {
+ --s->block_length[0];
+ BROTLI_LOG_ARRAY_INDEX(s->ringbuffer, pos);
+ ++pos;
+ if (BROTLI_PREDICT_FALSE(pos == s->ringbuffer_size)) {
+ s->state = BROTLI_STATE_COMMAND_INNER_WRITE;
+ --i;
+ goto saveStateAndReturn;
+ }
+ } while (--i != 0);
+ } else { /* safe */
+ do {
+ if (BROTLI_PREDICT_FALSE(s->block_length[0] == 0)) {
+ goto NextLiteralBlock;
+ }
brotli_reg_t literal;
if (!SafeReadSymbol(s->literal_htree, br, &literal)) {
result = BROTLI_DECODER_NEEDS_MORE_INPUT;
goto saveStateAndReturn;
}
s->ringbuffer[pos] = (uint8_t)literal;
- }
- --s->block_length[0];
- BROTLI_LOG_ARRAY_INDEX(s->ringbuffer, pos);
- ++pos;
- if (BROTLI_PREDICT_FALSE(pos == s->ringbuffer_size)) {
- s->state = BROTLI_STATE_COMMAND_INNER_WRITE;
- --i;
- goto saveStateAndReturn;
- }
- } while (--i != 0);
+ --s->block_length[0];
+ BROTLI_LOG_ARRAY_INDEX(s->ringbuffer, pos);
+ ++pos;
+ if (BROTLI_PREDICT_FALSE(pos == s->ringbuffer_size)) {
+ s->state = BROTLI_STATE_COMMAND_INNER_WRITE;
+ --i;
+ goto saveStateAndReturn;
+ }
+ } while (--i != 0);
+ }
} else {
uint8_t p1 = s->ringbuffer[(pos - 1) & s->ringbuffer_mask];
uint8_t p2 = s->ringbuffer[(pos - 2) & s->ringbuffer_mask];