Special implementation of string hashing for sizes greater than 64 bytes.

AES instructions are used when available. We load blocks of 64 bytes of the string into 4 independently hashed 128-bit vectors and use AES encrypt and decrypt instructions to mix the bits. Because the four hash states are independent, the instructions run in parallel.

The last <=64 bytes are loaded into 4 overlapping vectors (or 2 if the remaining length is <=32) and given an additional round of encryption. At the end we mix the states with another encryption round, similar to the 33-64 byte case.

```
name                                     CYCLES/op     CYCLES/op   vs base
BM_HASHING_Combine_contiguous_Fleet_hot  479.0m ± 1%   437.0m ± 0%   -8.77% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_Fleet_cold 1.700 ± 2%    1.526 ± 2%  -10.24% (p=0.000 n=30)

arcadia-rome:
BM_HASHING_Combine_contiguous_Fleet_hot  465.0m ± 1%   452.0m ± 1%   -2.80% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_Fleet_cold 4.024 ± 1%    3.676 ± 0%   -8.66% (p=0.000 n=30)
```

ASM analysis https://godbolt.org/z/5EzEnT46j shows a saving of 8 cycles for a 128-byte string. We also perform 2x fewer load operations.

PiperOrigin-RevId: 842818076
Change-Id: Ib89f25e0bae2c8ba9ed340350408c27afe6fd222
diff --git a/absl/hash/hash_test.cc b/absl/hash/hash_test.cc
index 500d1e0..89e0470 100644
--- a/absl/hash/hash_test.cc
+++ b/absl/hash/hash_test.cc
@@ -1284,7 +1284,7 @@
   constexpr char kMinChar = 0;
   constexpr char kMaxChar = 64;
   // These sizes cover the different hashing cases.
-  for (size_t size : {8u, 16u, 32u, 64u}) {
+  for (size_t size : {8u, 16u, 32u, 64u, 128u}) {
     for (size_t b = 0; b < size - 1; ++b) {
       absl::flat_hash_set<std::string> set;
       std::string s(size, '\0');
diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc
index 6b02f19..b8a0016 100644
--- a/absl/hash/internal/hash.cc
+++ b/absl/hash/internal/hash.cc
@@ -26,7 +26,6 @@
 #include "absl/base/prefetch.h"
 #include "absl/hash/internal/city.h"
 
-
 #ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
 #error ABSL_AES_INTERNAL_HAVE_X86_SIMD cannot be directly set
 #elif defined(__SSE4_2__) && defined(__AES__)
@@ -46,18 +45,20 @@
 
 namespace {
 
-uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
-  uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
-  uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
-  uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
-  uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
-
-  uint64_t cs0 = Mix(a ^ kStaticRandomData[1], b ^ current_state);
-  uint64_t cs1 = Mix(c ^ kStaticRandomData[2], d ^ current_state);
-  return cs0 ^ cs1;
+void PrefetchFutureDataToLocalCache(const uint8_t* ptr) {
+  PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
 }
 
 #ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
+uint64_t Mix4x16Vectors(__m128i a, __m128i b, __m128i c, __m128i d) {
+  // res128 = encrypt(a + c, d) + decrypt(b - d, a)
+  auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(a, c), d),
+                              _mm_aesdec_si128(_mm_sub_epi64(b, d), a));
+  auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128));
+  auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1));
+  return x64 ^ y64;
+}
+
 uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) {
   assert(len > 32);
   assert(len <= 64);
@@ -84,13 +85,82 @@
 
   // We perform another round of encryption to mix bits between two halves of
   // the input.
-  auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(na, nc), nd),
-                              _mm_aesdec_si128(_mm_sub_epi64(nb, nd), na));
-  auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128));
-  auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1));
-  return x64 ^ y64;
+  return Mix4x16Vectors(na, nb, nc, nd);
+}
+
+[[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t
+LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) {
+  assert(len > 64);
+  const uint8_t* ptr = static_cast<const uint8_t*>(data);
+  const uint8_t* last_32_ptr = ptr + len - 32;
+
+  // If we have more than 64 bytes, we're going to handle chunks of 64
+  // bytes at a time. We're going to build up four separate hash states
+  // which we will then hash together. This avoids short dependency chains.
+  __m128i state0 =
+      _mm_set_epi64x(static_cast<int64_t>(seed), static_cast<int64_t>(len));
+  __m128i state1 = state0;
+  __m128i state2 = state1;
+  __m128i state3 = state2;
+
+  // Mixing two 128-bit vectors at a time with corresponding states.
+  // All variables are mixed slightly differently to avoid hash collision
+  // due to trivial byte rotation.
+  // We combine state and data with _mm_add_epi64/_mm_sub_epi64 before applying
+  // AES encryption to make hash function dependent on the order of the blocks.
+  // See comments in LowLevelHash33To64 for more considerations.
+  auto mix_ab = [&state0,
+                 &state1](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE {
+    // i128 a = *p;
+    // i128 b = *(p + 16);
+    // state0 = decrypt(state0 + a, state0);
+    // state1 = decrypt(state1 - b, state1);
+    auto a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+    auto b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
+    state0 = _mm_aesdec_si128(_mm_add_epi64(state0, a), state0);
+    state1 = _mm_aesdec_si128(_mm_sub_epi64(state1, b), state1);
+  };
+  auto mix_cd = [&state2,
+                 &state3](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE {
+    // i128 c = *p;
+    // i128 d = *(p + 16);
+    // state2 = encrypt(state2 + c, state2);
+    // state3 = encrypt(state3 - d, state3);
+    auto c = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+    auto d = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16));
+    state2 = _mm_aesenc_si128(_mm_add_epi64(state2, c), state2);
+    state3 = _mm_aesenc_si128(_mm_sub_epi64(state3, d), state3);
+  };
+
+  do {
+    PrefetchFutureDataToLocalCache(ptr);
+    mix_ab(ptr);
+    mix_cd(ptr + 32);
+
+    ptr += 64;
+    len -= 64;
+  } while (len > 64);
+
+  // At most 64 bytes of data remain at `ptr`.
+  if (len > 32) {
+    mix_ab(ptr);
+  }
+  mix_cd(last_32_ptr);
+
+  return Mix4x16Vectors(state0, state1, state2, state3);
 }
 #else
+uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
+  uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
+  uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
+  uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
+  uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
+
+  uint64_t cs0 = Mix(a ^ kStaticRandomData[1], b ^ current_state);
+  uint64_t cs1 = Mix(c ^ kStaticRandomData[2], d ^ current_state);
+  return cs0 ^ cs1;
+}
+
 uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) {
   assert(len > 32);
   assert(len <= 64);
@@ -98,7 +168,6 @@
   const uint8_t* last_32_ptr = ptr + len - 32;
   return Mix32Bytes(last_32_ptr, Mix32Bytes(ptr, current_state));
 }
-#endif  // ABSL_AES_INTERNAL_HAVE_X86_SIMD
 
 [[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t
 LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) {
@@ -114,7 +183,7 @@
   uint64_t duplicated_state2 = current_state;
 
   do {
-    PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
+    PrefetchFutureDataToLocalCache(ptr);
 
     uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
     uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
@@ -148,6 +217,7 @@
   // safely read from `ptr + len - 32`.
   return Mix32Bytes(last_32_ptr, current_state);
 }
+#endif  // ABSL_AES_INTERNAL_HAVE_X86_SIMD
 
 [[maybe_unused]] uint64_t LowLevelHashLenGt32(uint64_t seed, const void* data,
                                               size_t len) {
diff --git a/absl/hash/internal/low_level_hash_test.cc b/absl/hash/internal/low_level_hash_test.cc
index cdd279b..d054337 100644
--- a/absl/hash/internal/low_level_hash_test.cc
+++ b/absl/hash/internal/low_level_hash_test.cc
@@ -380,28 +380,28 @@
       0xe4c78173c7ea537b, 0x0bbdc2bcabdb50b1, 0xd9aa134df2d87623,
       0x6c4907c9477a9409, 0xc3e418a5dbda52e5, 0x4d24f3e9d0dda93a,
       0xcdb565a363dbe45f, 0xa95f228c8ee57478, 0x6b8f00bab5130227,
-      0x2d05a0f44818b67a, 0xa64b55b071afbbea, 0xa205bfe6c724ce4d,
-      0x69dd26ca8ac21744, 0xef80e2ff2f6a9bc0, 0xde266c0baa202c20,
-      0xfa3463080ac74c50, 0x379d968a40125c2b, 0x4cbbd0a7b3c7d648,
-      0xc92afd93f4c665d2, 0x6e28f5adb7ae38dc, 0x7c689c9c237be35e,
-      0xaea41b29bd9d0f73, 0x832cef631d77e59f, 0x70cac8e87bc37dd3,
-      0x8e8c98bbde68e764, 0xd6117aeb3ddedded, 0xd796ab808e766240,
-      0x8953d0ea1a7d9814, 0xa212eba4281b391c, 0x21a555a8939ce597,
-      0x809d31660f6d81a8, 0x2356524b20ab400f, 0x5bc611e1e49d0478,
-      0xba9c065e2f385ce2, 0xb0a0fd12f4e83899, 0x14d076a35b1ff2ca,
-      0x8acd0bb8cf9a93c0, 0xe62e8ec094039ee4, 0x38a536a7072bdc61,
-      0xca256297602524f8, 0xfc62ebfb3530caeb, 0x8d8b0c05520569f6,
-      0xbbaca65cf154c59d, 0x3739b5ada7e338d3, 0xdb9ea31f47365340,
-      0x410b5c9c1da56755, 0x7e0abc03dbd10283, 0x136f87be70ed442e,
-      0x6b727d4feddbe1e9, 0x074ebb21183b01df, 0x3fe92185b1985484,
-      0xc5d8efd3c68305ca, 0xd9bada21b17e272e, 0x64d73133e1360f83,
-      0xeb8563aa993e21f9, 0xe5e8da50cceab28f, 0x7a6f92eb3223d2f3,
-      0xbdaf98370ea9b31b, 0x1682a84457f077bc, 0x4abd2d33b6e3be37,
-      0xb35bc81a7c9d4c04, 0x3e5bde3fb7cfe63d, 0xff3abe6e2ffec974,
-      0xb8116dd26cf6feec, 0x7a77a6e4ed0cf081, 0xb71eec2d5a184316,
-      0x6fa932f77b4da817, 0x795f79b33909b2c4, 0x1b8755ef6b5eb34e,
-      0x2255b72d7d6b2d79, 0xf2bdafafa90bd50a, 0x442a578f02cb1fc8,
-      0xc25aefe55ecf83db, 0x3114c056f9c5a676,
+      0x2d05a0f44818b67a, 0xd6bf7d990b5f44cb, 0xa3608bdb4712861a,
+      0xf20c33e5e355330b, 0xbc86e1b13130180d, 0x0848221b397b839a,
+      0x17cc0acf44a7e210, 0xc18c6dc584fe0f62, 0x896c7858a59f991d,
+      0xeab1e6d7d2856ed7, 0x7e4b2d99c23edc51, 0x9aeeeb7fa46e7cf0,
+      0x161b9f2e3611790f, 0x5f82aae18d971b36, 0x8d0dd9965881e162,
+      0x56700ea26285895a, 0xcd919c86c29a053e, 0x3e5d589282d9a722,
+      0x92caee9f48a66604, 0x7e1a2fd9b06f14b0, 0xce1d5293f95b0178,
+      0x8101361290e70a11, 0x570e3e9c9eafc1c6, 0x77b6241926a7a568,
+      0x313e5cb34f346699, 0xab8ebeab0514b82b, 0x6e0a43763a310408,
+      0x761b76ec22b2e440, 0x4238c84a9ec00528, 0xb9ea1f6d4d5552af,
+      0xd21f8f110b9dc060, 0xb3d3842b69ac3689, 0xd0a88aa1dcf59869,
+      0xf3f69f637b123403, 0xf5f34b1068cac7da, 0xe69a08d604774abf,
+      0x57648d3a73332437, 0x9762947f5013d00d, 0x35c5d734a0015922,
+      0xbee2fe5a104ce209, 0xedb060efa6efca34, 0x5ccf0f4786d97bc2,
+      0x1ef8ed72e80d7bef, 0x58522deb49c5e30f, 0xde97cd2a6f8bd13b,
+      0x3fae37c6f9855d09, 0xea99ae786feca261, 0x8c6d1d46670b0943,
+      0x84658b2a232c7bfb, 0x7058b7a7968de394, 0x0d44fba68e25aa8f,
+      0xc7f687020f8eb00b, 0xbf9671e1196153d6, 0x1009be891b7f83e7,
+      0x4f9457fb4aa12865, 0x30a49d9563643b32, 0x0302e2c5b46d5a3a,
+      0x77553f42fb0bfbf7, 0x26b95e89f0077110, 0x76ce68ebe01191ba,
+      0x724110fb509e4376, 0xebe74b016b5cfb88, 0x3b0fe11dcf175fc9,
+      0x20b737b9c0490538, 0x0db21c429b45fd17,
   };
 #else
   constexpr uint64_t kGolden[kNumGoldenOutputs] = {