Prefetch the backreference hashtable bucket.

Place the prefetch before the last distance checks, to give the prefetch enough time to work.

PiperOrigin-RevId: 609192361
diff --git a/c/common/platform.h b/c/common/platform.h
index dbba942..c18ff02 100644
--- a/c/common/platform.h
+++ b/c/common/platform.h
@@ -519,6 +519,21 @@
 #if BROTLI_ENABLE_DUMP
   BROTLI_UNUSED(&BrotliDump);
 #endif
+
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC)  /* _mm_prefetch() is not defined outside of x86/x64 */
+#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#  define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#elif BROTLI_GNUC_HAS_BUILTIN(__builtin_prefetch, 3, 1, 0)
+#  define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#elif defined(__aarch64__)
+#  define PREFETCH_L1(ptr)  do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
+#  define PREFETCH_L2(ptr)  do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
+#else
+#  define PREFETCH_L1(ptr) do { (void)(ptr); } while (0)  /* disabled */
+#  define PREFETCH_L2(ptr) do { (void)(ptr); } while (0)  /* disabled */
+#endif
 }
 
 #endif  /* BROTLI_COMMON_PLATFORM_H_ */
diff --git a/c/enc/hash_longest_match64_inc.h b/c/enc/hash_longest_match64_inc.h
index 441ab86..7e0b2f5 100644
--- a/c/enc/hash_longest_match64_inc.h
+++ b/c/enc/hash_longest_match64_inc.h
@@ -170,6 +170,11 @@
   score_t best_score = out->score;
   size_t best_len = out->len;
   size_t i;
+  /* Precalculate the hash key and prefetch the bucket. */
+  const size_t key = FN(HashBytes)(&data[cur_ix_masked], self->hash_mul_);
+  uint32_t* BROTLI_RESTRICT bucket = &buckets[key << self->block_bits_];
+  PREFETCH_L1(bucket);
+  if (self->block_bits_ > 4) PREFETCH_L1(bucket + 16);
   out->len = 0;
   out->len_code_delta = 0;
 
@@ -220,8 +225,6 @@
     best_len = 3;
   }
   {
-    const size_t key = FN(HashBytes)(&data[cur_ix_masked], self->hash_mul_);
-    uint32_t* BROTLI_RESTRICT bucket = &buckets[key << self->block_bits_];
     const size_t down =
         (num[key] > self->block_size_) ?
         (num[key] - self->block_size_) : 0u;
diff --git a/c/enc/hash_longest_match_inc.h b/c/enc/hash_longest_match_inc.h
index 4679dac..b744f4d 100644
--- a/c/enc/hash_longest_match_inc.h
+++ b/c/enc/hash_longest_match_inc.h
@@ -169,11 +169,17 @@
   score_t best_score = out->score;
   size_t best_len = out->len;
   size_t i;
+  /* Precalculate the hash key and prefetch the bucket. */
+  const uint32_t key =
+      FN(HashBytes)(&data[cur_ix_masked], self->hash_shift_);
+  uint32_t* BROTLI_RESTRICT bucket = &buckets[key << self->block_bits_];
+  PREFETCH_L1(bucket);
+  if (self->block_bits_ > 4) PREFETCH_L1(bucket + 16);
+  out->len = 0;
+  out->len_code_delta = 0;
 
   BROTLI_DCHECK(cur_ix_masked + max_length <= ring_buffer_mask);
 
-  out->len = 0;
-  out->len_code_delta = 0;
   /* Try last distance first. */
   for (i = 0; i < (size_t)self->num_last_distances_to_check_; ++i) {
     const size_t backward = (size_t)distance_cache[i];
@@ -219,9 +225,6 @@
     best_len = 3;
   }
   {
-    const uint32_t key =
-        FN(HashBytes)(&data[cur_ix_masked], self->hash_shift_);
-    uint32_t* BROTLI_RESTRICT bucket = &buckets[key << self->block_bits_];
     const size_t down =
         (num[key] > self->block_size_) ? (num[key] - self->block_size_) : 0u;
     for (i = num[key]; i > down;) {