Specialize long strings hash for ARM to benefit from AES instruction differences. ``` name CYCLES/op CYCLES/op vs base BM_HASHING_Combine_contiguous_Fleet_hot 540.0m ± 2% 507.0m ± 2% -6.11% (p=0.002 n=6) BM_HASHING_Combine_contiguous_Fleet_cold 2.124 ± 12% 2.027 ± 3% -4.54% (p=0.041 n=6) ``` ASM diff 1. 33-64: -4 cycles https://godbolt.org/z/nEYEKP4M3 2. 65+: -6 cycles https://godbolt.org/z/xsnjh678c 3. -104 bytes of binary size: https://godbolt.org/z/j8absY8M8 PiperOrigin-RevId: 866058683 Change-Id: I7d2f13532ce4fd2bec0382af0ba116967d5aa063
diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc index 74b6d57..43cb561 100644 --- a/absl/hash/internal/hash.cc +++ b/absl/hash/internal/hash.cc
@@ -78,6 +78,9 @@ return _mm_sub_epi64(a, b); } +// Bits of the second argument to Encrypt128/Decrypt128 are XORed with the +// first argument after encryption/decryption. + inline Vector128 Encrypt128(Vector128 data, Vector128 key) { return _mm_aesenc_si128(data, key); } @@ -86,6 +89,28 @@ return _mm_aesdec_si128(data, key); } +// We use each value as the first argument to shuffle all the bits around. We do +// not add any salt to the state or loaded data, instead we vary instructions +// used to mix bits Encrypt128/Decrypt128 and Add128/Sub128. On x86, +// Add128/Sub128 are combined to one instruction with data loading like +// `vpaddq xmm1, xmm0, xmmword ptr [rdi]`. + +inline Vector128 MixA(Vector128 a, Vector128 state) { + return Decrypt128(Add128(state, a), state); +} + +inline Vector128 MixB(Vector128 b, Vector128 state) { + return Decrypt128(Sub128(state, b), state); +} + +inline Vector128 MixC(Vector128 c, Vector128 state) { + return Encrypt128(Add128(state, c), state); +} + +inline Vector128 MixD(Vector128 d, Vector128 state) { + return Encrypt128(Sub128(state, d), state); +} + inline uint64_t ExtractLow64(Vector128 v) { return static_cast<uint64_t>(_mm_cvtsi128_si64(v)); } @@ -118,37 +143,41 @@ vaddq_u64(vreinterpretq_u64_u8(a), vreinterpretq_u64_u8(b))); } -inline Vector128 Sub128(Vector128 a, Vector128 b) { - return vreinterpretq_u8_u64( - vsubq_u64(vreinterpretq_u64_u8(a), vreinterpretq_u64_u8(b))); -} - -// Encrypt128 and Decrypt128 are equivalent to x86 versions. -// `vaeseq_u8` and `vaesdq_u8` perform `^ key` before encryption/decryption. -// We need to perform `^ key` after encryption/decryption to improve hash -// quality and match x86 version. +// Bits of the second argument to Decrypt128/Encrypt128 are XORed with the +// state argument BEFORE encryption (in x86 version they are XORed after). inline Vector128 Encrypt128(Vector128 data, Vector128 key) { - return vaesmcq_u8(vaeseq_u8(data, Vector128{})) ^ key; -} - -inline Vector128 Decrypt128(Vector128 data, Vector128 key) { - return vaesimcq_u8(vaesdq_u8(data, Vector128{})) ^ key; -} - -// ArmEncrypt128 and ArmDecrypt128 are ARM specific versions that use ARM -// instructions directly. -// We can only use these versions in the second round of encryption/decryption -// to have good hash quality. - -inline Vector128 ArmEncrypt128(Vector128 data, Vector128 key) { return vaesmcq_u8(vaeseq_u8(data, key)); } -inline Vector128 ArmDecrypt128(Vector128 data, Vector128 key) { +inline Vector128 Decrypt128(Vector128 data, Vector128 key) { return vaesimcq_u8(vaesdq_u8(data, key)); } +// We use decryption for a, b and encryption for c, d. That helps us to avoid +// collisions for trivial byte rotations. Mix4x16Vectors later uses +// encrypted/decrypted pairs differently to ensure that the order of blocks is +// important for the hash value. +// We also avoid using Add128/Sub128 instructions because state is being mixed +// before encryption/decryption. On ARM, there is no fusion of load and add/sub +// instructions so it is more expensive to use them. + +inline Vector128 MixA(Vector128 a, Vector128 state) { + return Decrypt128(a, state); +} + +inline Vector128 MixB(Vector128 b, Vector128 state) { + return Decrypt128(b, state); +} + +inline Vector128 MixC(Vector128 c, Vector128 state) { + return Encrypt128(c, state); +} + +inline Vector128 MixD(Vector128 d, Vector128 state) { + return Encrypt128(d, state); +} + inline uint64_t ExtractLow64(Vector128 v) { return vgetq_lane_u64(vreinterpretq_u64_u8(v), 0); } @@ -158,7 +187,7 @@ } uint64_t Mix4x16Vectors(Vector128 a, Vector128 b, Vector128 c, Vector128 d) { - Vector128 res128 = Add128(ArmEncrypt128(a, c), ArmDecrypt128(b, d)); + Vector128 res128 = Add128(Encrypt128(a, c), Decrypt128(b, d)); uint64_t x64 = ExtractLow64(res128); uint64_t y64 = ExtractHigh64(res128); return x64 ^ y64; @@ -176,16 +205,10 @@ Vector128 c = Load128(last32_ptr); Vector128 d = Load128(last32_ptr + 16); - // Bits of the second argument to Decrypt128/Encrypt128 are XORed with the - // state argument after encryption. We use each value as the first argument to - // shuffle all the bits around. We do not add any salt to the state or loaded - // data, instead we vary instructions used to mix bits Decrypt128/Encrypt128 - // and Add128/Sub128. On x86 Add128/Sub128 are combined to one instruction - // with data loading like `vpaddq xmm1, xmm0, xmmword ptr [rdi]`. - Vector128 na = Decrypt128(Add128(state, a), state); - Vector128 nb = Decrypt128(Sub128(state, b), state); - Vector128 nc = Encrypt128(Add128(state, c), state); - Vector128 nd = Encrypt128(Sub128(state, d), state); + Vector128 na = MixA(a, state); + Vector128 nb = MixB(b, state); + Vector128 nc = MixC(c, state); + Vector128 nd = MixD(d, state); // We perform another round of encryption to mix bits between two halves of // the input. @@ -216,15 +239,15 @@ &state1](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE { Vector128 a = Load128(p); Vector128 b = Load128(p + 16); - state0 = Decrypt128(Add128(state0, a), state0); - state1 = Decrypt128(Sub128(state1, b), state1); + state0 = MixA(a, state0); + state1 = MixB(b, state1); }; auto mix_cd = [&state2, &state3](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE { Vector128 c = Load128(p); Vector128 d = Load128(p + 16); - state2 = Encrypt128(Add128(state2, c), state2); - state3 = Encrypt128(Sub128(state3, d), state3); + state2 = MixC(c, state2); + state3 = MixD(d, state3); }; do {
diff --git a/absl/hash/internal/low_level_hash_test.cc b/absl/hash/internal/low_level_hash_test.cc index 6476462..f1e2d15 100644 --- a/absl/hash/internal/low_level_hash_test.cc +++ b/absl/hash/internal/low_level_hash_test.cc
@@ -405,38 +405,38 @@ }; #elif (defined(ABSL_INTERNAL_HAVE_ARM_NEON) && defined(__ARM_FEATURE_CRYPTO)) constexpr uint64_t kGolden[kNumGoldenOutputs] = { - 0x248c82373f7f0d24, 0x0a4f8cbf55047086, 0x498dd0a4445ed65f, - 0x6d418b193638ab0f, 0x4112c1ce9142980e, 0x02132db6a189f206, - 0x36e550ca9139ee44, 0xe0a67fdbd8627314, 0x2b9528fe18f65d1d, - 0x037d9cf48ca9fd1c, 0xa87222631332ca06, 0x31f35e09af065022, - 0xacd6f1d29071c8e5, 0x54eb4f229a0d15a4, 0x132c27c6a747e136, - 0x13d7cb427efe7e2f, 0x71a5d21e00f00dcd, 0xdb909cbf1b0fbb64, - 0x4dc943ad8901fa5b, 0xacf4b3d41a46feb5, 0x12a37d19c14ffd65, - 0xdb511011bcdd95b9, 0xd6d75af1c8b86dbd, 0xc65aefdcff941a8e, - 0xbea311ef212c0306, 0xc49861afe7a6888b, 0x598424b541daf528, - 0x4cc264fbb57c4640, 0x1a7376211a5e674a, 0xcb82900ad06bf89b, - 0xd9d6201d685a4971, 0x77ed120a277ce616, 0x9bd5ad973b4d358c, - 0x880850ff91b6b772, 0xf9d24d40448fce38, 0x870b8ee54a31707d, - 0x613130fe647bd169, 0x04a0cc02a81989f3, 0x998adbda0ab3f8fa, - 0x2b28729269102040, 0xbdb9be95e352a6d5, 0xd5e2a55d78bd9fb0, - 0xef609c2b22eb93d6, 0xac23eb02494ae8e0, 0xcb8c5ab08163d2a3, - 0x63f822fc21b42698, 0xe7e8814288c470cc, 0x143b07aae2ccd592, - 0xc5d142f4806e13a3, 0xe695de2a1d8a344b, 0xc8ddc3ed542a5988, - 0x60ec526cc1e5274d, 0x732a04dcf34a7ac9, 0xf1daef52096a872a, - 0x541f04b87b3de158, 0xeb143a708b621f94, 0x0849cd39e156b25f, - 0x36eb0746caa62c5c, 0xbfa14eb3c31f78bf, 0x256637f35dc41f20, - 0x08293113693a58e2, 0x064202395a685840, 0x0593285ee1ed42ea, - 0xdcbf16fd8a44f213, 0xe9f5586745f4f23d, 0x66808a2c18365ae9, - 0xa70496836a5166e1, 0xe9ed7d0f9f572246, 0x024ba6063287d0cb, - 0xa441f6ac287479db, 0x72502c190698ee02, 0xb79705c6ced58c29, - 0x5c03f52968cb1fdc, 0x6f4b7c6bed6cc232, 0xed834775697438d3, - 0x6273b075725ffb6f, 0x60df77a69e9aafb2, 0x84483bf48b989c4e, - 0x37e42a1d35795a31, 0x280dcdb36b853ae5, 0x63309d698f2dd42c, - 0x24e65be2c805ea5b, 0x1db08e0d041efdf9, 0xb94aea8c4648772b, - 0x109f2b81aa4660d2, 0xcae92809feb1a390, 0x0a1cbf9628383b41, - 0xca0bf416706fc5c8, 0x9d4751bd7e638488, 0x343b363d5d96c7c7, - 0x6bacaedde1daf5aa, 0x721ead1618c4e405, 0xcfc19e400cb6dbc6, - 0x7ac0dd9128ec8cc3, 0xb7dd428bb44fc744, + 0xe3de56d839559570, 0x77afbb8906ccf19a, 0xe66e02ad8e92c12f, + 0x36cc6f2fe751bc64, 0xf4d28aea3ed69ade, 0xf246fd6a4ca33ab3, + 0xcae9edaf829a6575, 0x11fa7c88bf4cc73a, 0x98c47bfbae3d8c02, + 0x27545ea34451a56f, 0x14b4c1a27d26d2b2, 0x7613ca9857e7cbe0, + 0x02efc8f12e71bb88, 0x9c1a83672eee7d5b, 0x2b89aa9cfde6d9b3, + 0xabeb2cf68ab463dc, 0xfb3db51d62d71d28, 0x3177cc3e0927e344, + 0xb5798fc1d348beb6, 0x212c471510859095, 0x191df651c270548d, + 0x1b15d6b8e8cc3105, 0xdc51a9e369fbab13, 0xb674fd0372fab5aa, + 0x048aec21666fc8d5, 0x8b15383bc4a7e244, 0xf8945c253a43469f, + 0x513f31e6bf240064, 0xc37c6225063c266f, 0x2a6747f8952ec44c, + 0xa7a0f11d3607f268, 0x9b58d5c889166fb3, 0xb75ca76a67212a39, + 0x9a71a476bf4933f3, 0x3f049a71b475edf7, 0x7a49c8907a278e5e, + 0xf9a78757d9355ac9, 0xbe1bd9d70ee46398, 0x600e24a76a359a39, + 0x235905ade793a5f0, 0x940c36e0df8ec13a, 0x9300c551cb93a286, + 0x5e265b90e4828aba, 0x42dee1626c647735, 0x8cd32910c1f17a8c, + 0x642f44f3f8290fb0, 0xb73ee167c5f8655e, 0x9d05deadb2a47fe4, + 0x03b359dff5351708, 0x19a6427c0ee9c907, 0x0692dc026847e0d9, + 0xe4023ba608b25f1e, 0xf438ad7eea71dfc1, 0x205120c9355d4ac8, + 0xd4cf2cb1f412c846, 0x82d7d98b87ecf53a, 0x2cc779469cc3d690, + 0xf93545883141ce8e, 0x0f794c869b82a28a, 0x014e9fccfe7f7d9e, + 0xb6953652e49e4e0a, 0x68035eabb056e9b6, 0xf1b3a6847e610274, + 0x59dc584137c0cb55, 0xafb36238ad797ec6, 0xb9f331777a030104, + 0x2899a97f98140d3b, 0x0ae342d8d4f80ad0, 0xcfb25544c873fbd1, + 0xcb2498ca84b01a4c, 0x1a50e4a0d8db8406, 0x0afbf05b7c8c146e, + 0x1b45ee2ee670d71b, 0xef9204d9fcc31075, 0x2a5750d2ef558495, + 0x8e151888d0ce024e, 0x26882404dd58bcc9, 0x6c830f947bf2195f, + 0x2ff0c1fa64bf8cfb, 0x8f108f869e11d224, 0x7ce757787a04fd76, + 0xee55da944ebbbd85, 0x43563645bb00dcd0, 0x643848bf73336681, + 0x050f54d0478ffd52, 0xc5c1bff1bc3c008c, 0x48fd0f2729c9402d, + 0x7685b990e3e264af, 0x940dccac3264ed16, 0x688f94ee88c9ba90, + 0xca112c7825f3944b, 0x584cb5ddba130be2, 0xb7ced01f7140dff6, + 0xc10dcbb4e77168e6, 0xb5ea4360351ebaef, }; #else constexpr uint64_t kGolden[kNumGoldenOutputs] = {