Special implementation for string hash with sizes greater than 64. AES instructions are used when available. We load blocks of 64 bytes of the string into 4 independently hashed 128-bit vectors. We use AES encrypt and decrypt to mix the bits. These instructions run in parallel. The last <=64 bytes are loaded into 4 (or 2 if the remaining length is <=32) overlapping vectors and encrypted additionally. At the end we mix with another round of encryption, similar to the 33-64 byte case. ``` name CYCLES/op CYCLES/op vs base BM_HASHING_Combine_contiguous_Fleet_hot 479.0m ± 1% 437.0m ± 0% -8.77% (p=0.000 n=30) BM_HASHING_Combine_contiguous_Fleet_cold 1.700 ± 2% 1.526 ± 2% -10.24% (p=0.000 n=30) arcadia-rome: BM_HASHING_Combine_contiguous_Fleet_hot 465.0m ± 1% 452.0m ± 1% -2.80% (p=0.000 n=30) BM_HASHING_Combine_contiguous_Fleet_cold 4.024 ± 1% 3.676 ± 0% -8.66% (p=0.000 n=30) ``` ASM analysis https://godbolt.org/z/5EzEnT46j shows a saving of 8 cycles for a 128-byte string. We also perform 2x fewer load operations. PiperOrigin-RevId: 842818076 Change-Id: Ib89f25e0bae2c8ba9ed340350408c27afe6fd222
diff --git a/absl/hash/hash_test.cc b/absl/hash/hash_test.cc index 500d1e0..89e0470 100644 --- a/absl/hash/hash_test.cc +++ b/absl/hash/hash_test.cc
@@ -1284,7 +1284,7 @@ constexpr char kMinChar = 0; constexpr char kMaxChar = 64; // These sizes cover the different hashing cases. - for (size_t size : {8u, 16u, 32u, 64u}) { + for (size_t size : {8u, 16u, 32u, 64u, 128u}) { for (size_t b = 0; b < size - 1; ++b) { absl::flat_hash_set<std::string> set; std::string s(size, '\0');
diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc index 6b02f19..b8a0016 100644 --- a/absl/hash/internal/hash.cc +++ b/absl/hash/internal/hash.cc
@@ -26,7 +26,6 @@ #include "absl/base/prefetch.h" #include "absl/hash/internal/city.h" - #ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD #error ABSL_AES_INTERNAL_HAVE_X86_SIMD cannot be directly set #elif defined(__SSE4_2__) && defined(__AES__) @@ -46,18 +45,20 @@ namespace { -uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) { - uint64_t a = absl::base_internal::UnalignedLoad64(ptr); - uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8); - uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16); - uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24); - - uint64_t cs0 = Mix(a ^ kStaticRandomData[1], b ^ current_state); - uint64_t cs1 = Mix(c ^ kStaticRandomData[2], d ^ current_state); - return cs0 ^ cs1; +void PrefetchFutureDataToLocalCache(const uint8_t* ptr) { + PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE); } #ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD +uint64_t Mix4x16Vectors(__m128i a, __m128i b, __m128i c, __m128i d) { + // res128 = encrypt(a + c, d) + decrypt(b - d, a) + auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(a, c), d), + _mm_aesdec_si128(_mm_sub_epi64(b, d), a)); + auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128)); + auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1)); + return x64 ^ y64; + } + uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) { assert(len > 32); assert(len <= 64); @@ -84,13 +85,82 @@ // We perform another round of encryption to mix bits between two halves of // the input. 
- auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(na, nc), nd), - _mm_aesdec_si128(_mm_sub_epi64(nb, nd), na)); - auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128)); - auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1)); - return x64 ^ y64; + return Mix4x16Vectors(na, nb, nc, nd); +} + +[[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t +LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) { + assert(len > 64); + const uint8_t* ptr = static_cast<const uint8_t*>(data); + const uint8_t* last_32_ptr = ptr + len - 32; + + // If we have more than 64 bytes, we're going to handle chunks of 64 + // bytes at a time. We're going to build up four separate hash states + // which we will then hash together. This avoids short dependency chains. + __m128i state0 = + _mm_set_epi64x(static_cast<int64_t>(seed), static_cast<int64_t>(len)); + __m128i state1 = state0; + __m128i state2 = state1; + __m128i state3 = state2; + + // Mixing two 128-bit vectors at a time with corresponding states. + // All variables are mixed slightly differently to avoid hash collision + // due to trivial byte rotation. + // We combine state and data with _mm_add_epi64/_mm_sub_epi64 before applying + // AES encryption to make hash function dependent on the order of the blocks. + // See comments in LowLevelHash33To64 for more considerations. 
+ auto mix_ab = [&state0, + &state1](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE { + // i128 a = *p; + // i128 b = *(p + 16); + // state0 = decrypt(state0 + a, state0); + // state1 = decrypt(state1 - b, state1); + auto a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); + auto b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16)); + state0 = _mm_aesdec_si128(_mm_add_epi64(state0, a), state0); + state1 = _mm_aesdec_si128(_mm_sub_epi64(state1, b), state1); + }; + auto mix_cd = [&state2, + &state3](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE { + // i128 c = *p; + // i128 d = *(p + 16); + // state2 = encrypt(state2 + c, state2); + // state3 = encrypt(state3 - d, state3); + auto c = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); + auto d = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 16)); + state2 = _mm_aesenc_si128(_mm_add_epi64(state2, c), state2); + state3 = _mm_aesenc_si128(_mm_sub_epi64(state3, d), state3); + }; + + do { + PrefetchFutureDataToLocalCache(ptr); + mix_ab(ptr); + mix_cd(ptr + 32); + + ptr += 64; + len -= 64; + } while (len > 64); + + // We now have a data `ptr` with at most 64 bytes. 
+ if (len > 32) { + mix_ab(ptr); + } + mix_cd(last_32_ptr); + + return Mix4x16Vectors(state0, state1, state2, state3); } #else +uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) { + uint64_t a = absl::base_internal::UnalignedLoad64(ptr); + uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8); + uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16); + uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24); + + uint64_t cs0 = Mix(a ^ kStaticRandomData[1], b ^ current_state); + uint64_t cs1 = Mix(c ^ kStaticRandomData[2], d ^ current_state); + return cs0 ^ cs1; +} + uint64_t LowLevelHash33To64(uint64_t seed, const uint8_t* ptr, size_t len) { assert(len > 32); assert(len <= 64); @@ -98,7 +168,6 @@ const uint8_t* last_32_ptr = ptr + len - 32; return Mix32Bytes(last_32_ptr, Mix32Bytes(ptr, current_state)); } -#endif // ABSL_AES_INTERNAL_HAVE_X86_SIMD [[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t LowLevelHashLenGt64(uint64_t seed, const void* data, size_t len) { @@ -114,7 +183,7 @@ uint64_t duplicated_state2 = current_state; do { - PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE); + PrefetchFutureDataToLocalCache(ptr); uint64_t a = absl::base_internal::UnalignedLoad64(ptr); uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8); @@ -148,6 +217,7 @@ // safely read from `ptr + len - 32`. return Mix32Bytes(last_32_ptr, current_state); } +#endif // ABSL_AES_INTERNAL_HAVE_X86_SIMD [[maybe_unused]] uint64_t LowLevelHashLenGt32(uint64_t seed, const void* data, size_t len) {
diff --git a/absl/hash/internal/low_level_hash_test.cc b/absl/hash/internal/low_level_hash_test.cc index cdd279b..d054337 100644 --- a/absl/hash/internal/low_level_hash_test.cc +++ b/absl/hash/internal/low_level_hash_test.cc
@@ -380,28 +380,28 @@ 0xe4c78173c7ea537b, 0x0bbdc2bcabdb50b1, 0xd9aa134df2d87623, 0x6c4907c9477a9409, 0xc3e418a5dbda52e5, 0x4d24f3e9d0dda93a, 0xcdb565a363dbe45f, 0xa95f228c8ee57478, 0x6b8f00bab5130227, - 0x2d05a0f44818b67a, 0xa64b55b071afbbea, 0xa205bfe6c724ce4d, - 0x69dd26ca8ac21744, 0xef80e2ff2f6a9bc0, 0xde266c0baa202c20, - 0xfa3463080ac74c50, 0x379d968a40125c2b, 0x4cbbd0a7b3c7d648, - 0xc92afd93f4c665d2, 0x6e28f5adb7ae38dc, 0x7c689c9c237be35e, - 0xaea41b29bd9d0f73, 0x832cef631d77e59f, 0x70cac8e87bc37dd3, - 0x8e8c98bbde68e764, 0xd6117aeb3ddedded, 0xd796ab808e766240, - 0x8953d0ea1a7d9814, 0xa212eba4281b391c, 0x21a555a8939ce597, - 0x809d31660f6d81a8, 0x2356524b20ab400f, 0x5bc611e1e49d0478, - 0xba9c065e2f385ce2, 0xb0a0fd12f4e83899, 0x14d076a35b1ff2ca, - 0x8acd0bb8cf9a93c0, 0xe62e8ec094039ee4, 0x38a536a7072bdc61, - 0xca256297602524f8, 0xfc62ebfb3530caeb, 0x8d8b0c05520569f6, - 0xbbaca65cf154c59d, 0x3739b5ada7e338d3, 0xdb9ea31f47365340, - 0x410b5c9c1da56755, 0x7e0abc03dbd10283, 0x136f87be70ed442e, - 0x6b727d4feddbe1e9, 0x074ebb21183b01df, 0x3fe92185b1985484, - 0xc5d8efd3c68305ca, 0xd9bada21b17e272e, 0x64d73133e1360f83, - 0xeb8563aa993e21f9, 0xe5e8da50cceab28f, 0x7a6f92eb3223d2f3, - 0xbdaf98370ea9b31b, 0x1682a84457f077bc, 0x4abd2d33b6e3be37, - 0xb35bc81a7c9d4c04, 0x3e5bde3fb7cfe63d, 0xff3abe6e2ffec974, - 0xb8116dd26cf6feec, 0x7a77a6e4ed0cf081, 0xb71eec2d5a184316, - 0x6fa932f77b4da817, 0x795f79b33909b2c4, 0x1b8755ef6b5eb34e, - 0x2255b72d7d6b2d79, 0xf2bdafafa90bd50a, 0x442a578f02cb1fc8, - 0xc25aefe55ecf83db, 0x3114c056f9c5a676, + 0x2d05a0f44818b67a, 0xd6bf7d990b5f44cb, 0xa3608bdb4712861a, + 0xf20c33e5e355330b, 0xbc86e1b13130180d, 0x0848221b397b839a, + 0x17cc0acf44a7e210, 0xc18c6dc584fe0f62, 0x896c7858a59f991d, + 0xeab1e6d7d2856ed7, 0x7e4b2d99c23edc51, 0x9aeeeb7fa46e7cf0, + 0x161b9f2e3611790f, 0x5f82aae18d971b36, 0x8d0dd9965881e162, + 0x56700ea26285895a, 0xcd919c86c29a053e, 0x3e5d589282d9a722, + 0x92caee9f48a66604, 0x7e1a2fd9b06f14b0, 0xce1d5293f95b0178, + 
0x8101361290e70a11, 0x570e3e9c9eafc1c6, 0x77b6241926a7a568, + 0x313e5cb34f346699, 0xab8ebeab0514b82b, 0x6e0a43763a310408, + 0x761b76ec22b2e440, 0x4238c84a9ec00528, 0xb9ea1f6d4d5552af, + 0xd21f8f110b9dc060, 0xb3d3842b69ac3689, 0xd0a88aa1dcf59869, + 0xf3f69f637b123403, 0xf5f34b1068cac7da, 0xe69a08d604774abf, + 0x57648d3a73332437, 0x9762947f5013d00d, 0x35c5d734a0015922, + 0xbee2fe5a104ce209, 0xedb060efa6efca34, 0x5ccf0f4786d97bc2, + 0x1ef8ed72e80d7bef, 0x58522deb49c5e30f, 0xde97cd2a6f8bd13b, + 0x3fae37c6f9855d09, 0xea99ae786feca261, 0x8c6d1d46670b0943, + 0x84658b2a232c7bfb, 0x7058b7a7968de394, 0x0d44fba68e25aa8f, + 0xc7f687020f8eb00b, 0xbf9671e1196153d6, 0x1009be891b7f83e7, + 0x4f9457fb4aa12865, 0x30a49d9563643b32, 0x0302e2c5b46d5a3a, + 0x77553f42fb0bfbf7, 0x26b95e89f0077110, 0x76ce68ebe01191ba, + 0x724110fb509e4376, 0xebe74b016b5cfb88, 0x3b0fe11dcf175fc9, + 0x20b737b9c0490538, 0x0db21c429b45fd17, }; #else constexpr uint64_t kGolden[kNumGoldenOutputs] = {