Specialize long strings hash for ARM to benefit from AES instruction differences.

```
name                                     CYCLES/op     CYCLES/op   vs base
BM_HASHING_Combine_contiguous_Fleet_hot  540.0m ±  2%   507.0m ± 2%   -6.11% (p=0.002 n=6)
BM_HASHING_Combine_contiguous_Fleet_cold  2.124 ± 12%    2.027 ± 3%   -4.54% (p=0.041 n=6)
```

ASM diff
1. 33-64: -4 cycles https://godbolt.org/z/nEYEKP4M3
2. 65+: -6 cycles https://godbolt.org/z/xsnjh678c
3. -104 bytes of binary size: https://godbolt.org/z/j8absY8M8

PiperOrigin-RevId: 866058683
Change-Id: I7d2f13532ce4fd2bec0382af0ba116967d5aa063
diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc
index 74b6d57..43cb561 100644
--- a/absl/hash/internal/hash.cc
+++ b/absl/hash/internal/hash.cc
@@ -78,6 +78,9 @@
   return _mm_sub_epi64(a, b);
 }
 
+// Bits of the second argument to Encrypt128/Decrypt128 are XORed with the
+// first argument after encryption/decryption.
+
 inline Vector128 Encrypt128(Vector128 data, Vector128 key) {
   return _mm_aesenc_si128(data, key);
 }
@@ -86,6 +89,28 @@
   return _mm_aesdec_si128(data, key);
 }
 
+// We use each value as the first argument to shuffle all the bits around. We
+// do not add any salt to the state or the loaded data; instead we vary the
+// instructions used to mix bits: Encrypt128/Decrypt128 and Add128/Sub128. On
+// x86, Add128/Sub128 fuse with the data load into a single instruction such
+// as `vpaddq xmm1, xmm0, xmmword ptr [rdi]`.
+
+inline Vector128 MixA(Vector128 a, Vector128 state) {
+  return Decrypt128(Add128(state, a), state);
+}
+
+inline Vector128 MixB(Vector128 b, Vector128 state) {
+  return Decrypt128(Sub128(state, b), state);
+}
+
+inline Vector128 MixC(Vector128 c, Vector128 state) {
+  return Encrypt128(Add128(state, c), state);
+}
+
+inline Vector128 MixD(Vector128 d, Vector128 state) {
+  return Encrypt128(Sub128(state, d), state);
+}
+
 inline uint64_t ExtractLow64(Vector128 v) {
   return static_cast<uint64_t>(_mm_cvtsi128_si64(v));
 }
@@ -118,37 +143,41 @@
       vaddq_u64(vreinterpretq_u64_u8(a), vreinterpretq_u64_u8(b)));
 }
 
-inline Vector128 Sub128(Vector128 a, Vector128 b) {
-  return vreinterpretq_u8_u64(
-      vsubq_u64(vreinterpretq_u64_u8(a), vreinterpretq_u64_u8(b)));
-}
-
-// Encrypt128 and Decrypt128 are equivalent to x86 versions.
-// `vaeseq_u8` and `vaesdq_u8` perform `^ key` before encryption/decryption.
-// We need to perform `^ key` after encryption/decryption to improve hash
-// quality and match x86 version.
+// Bits of the second (state) argument to Decrypt128/Encrypt128 are XORed with
+// the first argument BEFORE encryption (in the x86 version, after).
 
 inline Vector128 Encrypt128(Vector128 data, Vector128 key) {
-  return vaesmcq_u8(vaeseq_u8(data, Vector128{})) ^ key;
-}
-
-inline Vector128 Decrypt128(Vector128 data, Vector128 key) {
-  return vaesimcq_u8(vaesdq_u8(data, Vector128{})) ^ key;
-}
-
-// ArmEncrypt128 and ArmDecrypt128 are ARM specific versions that use ARM
-// instructions directly.
-// We can only use these versions in the second round of encryption/decryption
-// to have good hash quality.
-
-inline Vector128 ArmEncrypt128(Vector128 data, Vector128 key) {
   return vaesmcq_u8(vaeseq_u8(data, key));
 }
 
-inline Vector128 ArmDecrypt128(Vector128 data, Vector128 key) {
+inline Vector128 Decrypt128(Vector128 data, Vector128 key) {
   return vaesimcq_u8(vaesdq_u8(data, key));
 }
 
+// We use decryption for a, b and encryption for c, d. That helps us avoid
+// collisions for trivial byte rotations. Mix4x16Vectors later uses the
+// encrypted/decrypted pairs differently to ensure that the order of blocks
+// matters for the hash value.
+// We also avoid the Add128/Sub128 instructions because the state is already
+// mixed in before encryption/decryption. On ARM there is no fusion of load
+// and add/sub instructions, so using them would be more expensive.
+
+inline Vector128 MixA(Vector128 a, Vector128 state) {
+  return Decrypt128(a, state);
+}
+
+inline Vector128 MixB(Vector128 b, Vector128 state) {
+  return Decrypt128(b, state);
+}
+
+inline Vector128 MixC(Vector128 c, Vector128 state) {
+  return Encrypt128(c, state);
+}
+
+inline Vector128 MixD(Vector128 d, Vector128 state) {
+  return Encrypt128(d, state);
+}
+
 inline uint64_t ExtractLow64(Vector128 v) {
   return vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
 }
@@ -158,7 +187,7 @@
 }
 
 uint64_t Mix4x16Vectors(Vector128 a, Vector128 b, Vector128 c, Vector128 d) {
-  Vector128 res128 = Add128(ArmEncrypt128(a, c), ArmDecrypt128(b, d));
+  Vector128 res128 = Add128(Encrypt128(a, c), Decrypt128(b, d));
   uint64_t x64 = ExtractLow64(res128);
   uint64_t y64 = ExtractHigh64(res128);
   return x64 ^ y64;
@@ -176,16 +205,10 @@
   Vector128 c = Load128(last32_ptr);
   Vector128 d = Load128(last32_ptr + 16);
 
-  // Bits of the second argument to Decrypt128/Encrypt128 are XORed with the
-  // state argument after encryption. We use each value as the first argument to
-  // shuffle all the bits around. We do not add any salt to the state or loaded
-  // data, instead we vary instructions used to mix bits Decrypt128/Encrypt128
-  // and Add128/Sub128. On x86 Add128/Sub128 are combined to one instruction
-  // with data loading like `vpaddq  xmm1, xmm0, xmmword ptr [rdi]`.
-  Vector128 na = Decrypt128(Add128(state, a), state);
-  Vector128 nb = Decrypt128(Sub128(state, b), state);
-  Vector128 nc = Encrypt128(Add128(state, c), state);
-  Vector128 nd = Encrypt128(Sub128(state, d), state);
+  Vector128 na = MixA(a, state);
+  Vector128 nb = MixB(b, state);
+  Vector128 nc = MixC(c, state);
+  Vector128 nd = MixD(d, state);
 
   // We perform another round of encryption to mix bits between two halves of
   // the input.
@@ -216,15 +239,15 @@
                  &state1](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE {
     Vector128 a = Load128(p);
     Vector128 b = Load128(p + 16);
-    state0 = Decrypt128(Add128(state0, a), state0);
-    state1 = Decrypt128(Sub128(state1, b), state1);
+    state0 = MixA(a, state0);
+    state1 = MixB(b, state1);
   };
   auto mix_cd = [&state2,
                  &state3](const uint8_t* p) ABSL_ATTRIBUTE_ALWAYS_INLINE {
     Vector128 c = Load128(p);
     Vector128 d = Load128(p + 16);
-    state2 = Encrypt128(Add128(state2, c), state2);
-    state3 = Encrypt128(Sub128(state3, d), state3);
+    state2 = MixC(c, state2);
+    state3 = MixD(d, state3);
   };
 
   do {
diff --git a/absl/hash/internal/low_level_hash_test.cc b/absl/hash/internal/low_level_hash_test.cc
index 6476462..f1e2d15 100644
--- a/absl/hash/internal/low_level_hash_test.cc
+++ b/absl/hash/internal/low_level_hash_test.cc
@@ -405,38 +405,38 @@
   };
 #elif (defined(ABSL_INTERNAL_HAVE_ARM_NEON) && defined(__ARM_FEATURE_CRYPTO))
   constexpr uint64_t kGolden[kNumGoldenOutputs] = {
-      0x248c82373f7f0d24, 0x0a4f8cbf55047086, 0x498dd0a4445ed65f,
-      0x6d418b193638ab0f, 0x4112c1ce9142980e, 0x02132db6a189f206,
-      0x36e550ca9139ee44, 0xe0a67fdbd8627314, 0x2b9528fe18f65d1d,
-      0x037d9cf48ca9fd1c, 0xa87222631332ca06, 0x31f35e09af065022,
-      0xacd6f1d29071c8e5, 0x54eb4f229a0d15a4, 0x132c27c6a747e136,
-      0x13d7cb427efe7e2f, 0x71a5d21e00f00dcd, 0xdb909cbf1b0fbb64,
-      0x4dc943ad8901fa5b, 0xacf4b3d41a46feb5, 0x12a37d19c14ffd65,
-      0xdb511011bcdd95b9, 0xd6d75af1c8b86dbd, 0xc65aefdcff941a8e,
-      0xbea311ef212c0306, 0xc49861afe7a6888b, 0x598424b541daf528,
-      0x4cc264fbb57c4640, 0x1a7376211a5e674a, 0xcb82900ad06bf89b,
-      0xd9d6201d685a4971, 0x77ed120a277ce616, 0x9bd5ad973b4d358c,
-      0x880850ff91b6b772, 0xf9d24d40448fce38, 0x870b8ee54a31707d,
-      0x613130fe647bd169, 0x04a0cc02a81989f3, 0x998adbda0ab3f8fa,
-      0x2b28729269102040, 0xbdb9be95e352a6d5, 0xd5e2a55d78bd9fb0,
-      0xef609c2b22eb93d6, 0xac23eb02494ae8e0, 0xcb8c5ab08163d2a3,
-      0x63f822fc21b42698, 0xe7e8814288c470cc, 0x143b07aae2ccd592,
-      0xc5d142f4806e13a3, 0xe695de2a1d8a344b, 0xc8ddc3ed542a5988,
-      0x60ec526cc1e5274d, 0x732a04dcf34a7ac9, 0xf1daef52096a872a,
-      0x541f04b87b3de158, 0xeb143a708b621f94, 0x0849cd39e156b25f,
-      0x36eb0746caa62c5c, 0xbfa14eb3c31f78bf, 0x256637f35dc41f20,
-      0x08293113693a58e2, 0x064202395a685840, 0x0593285ee1ed42ea,
-      0xdcbf16fd8a44f213, 0xe9f5586745f4f23d, 0x66808a2c18365ae9,
-      0xa70496836a5166e1, 0xe9ed7d0f9f572246, 0x024ba6063287d0cb,
-      0xa441f6ac287479db, 0x72502c190698ee02, 0xb79705c6ced58c29,
-      0x5c03f52968cb1fdc, 0x6f4b7c6bed6cc232, 0xed834775697438d3,
-      0x6273b075725ffb6f, 0x60df77a69e9aafb2, 0x84483bf48b989c4e,
-      0x37e42a1d35795a31, 0x280dcdb36b853ae5, 0x63309d698f2dd42c,
-      0x24e65be2c805ea5b, 0x1db08e0d041efdf9, 0xb94aea8c4648772b,
-      0x109f2b81aa4660d2, 0xcae92809feb1a390, 0x0a1cbf9628383b41,
-      0xca0bf416706fc5c8, 0x9d4751bd7e638488, 0x343b363d5d96c7c7,
-      0x6bacaedde1daf5aa, 0x721ead1618c4e405, 0xcfc19e400cb6dbc6,
-      0x7ac0dd9128ec8cc3, 0xb7dd428bb44fc744,
+      0xe3de56d839559570, 0x77afbb8906ccf19a, 0xe66e02ad8e92c12f,
+      0x36cc6f2fe751bc64, 0xf4d28aea3ed69ade, 0xf246fd6a4ca33ab3,
+      0xcae9edaf829a6575, 0x11fa7c88bf4cc73a, 0x98c47bfbae3d8c02,
+      0x27545ea34451a56f, 0x14b4c1a27d26d2b2, 0x7613ca9857e7cbe0,
+      0x02efc8f12e71bb88, 0x9c1a83672eee7d5b, 0x2b89aa9cfde6d9b3,
+      0xabeb2cf68ab463dc, 0xfb3db51d62d71d28, 0x3177cc3e0927e344,
+      0xb5798fc1d348beb6, 0x212c471510859095, 0x191df651c270548d,
+      0x1b15d6b8e8cc3105, 0xdc51a9e369fbab13, 0xb674fd0372fab5aa,
+      0x048aec21666fc8d5, 0x8b15383bc4a7e244, 0xf8945c253a43469f,
+      0x513f31e6bf240064, 0xc37c6225063c266f, 0x2a6747f8952ec44c,
+      0xa7a0f11d3607f268, 0x9b58d5c889166fb3, 0xb75ca76a67212a39,
+      0x9a71a476bf4933f3, 0x3f049a71b475edf7, 0x7a49c8907a278e5e,
+      0xf9a78757d9355ac9, 0xbe1bd9d70ee46398, 0x600e24a76a359a39,
+      0x235905ade793a5f0, 0x940c36e0df8ec13a, 0x9300c551cb93a286,
+      0x5e265b90e4828aba, 0x42dee1626c647735, 0x8cd32910c1f17a8c,
+      0x642f44f3f8290fb0, 0xb73ee167c5f8655e, 0x9d05deadb2a47fe4,
+      0x03b359dff5351708, 0x19a6427c0ee9c907, 0x0692dc026847e0d9,
+      0xe4023ba608b25f1e, 0xf438ad7eea71dfc1, 0x205120c9355d4ac8,
+      0xd4cf2cb1f412c846, 0x82d7d98b87ecf53a, 0x2cc779469cc3d690,
+      0xf93545883141ce8e, 0x0f794c869b82a28a, 0x014e9fccfe7f7d9e,
+      0xb6953652e49e4e0a, 0x68035eabb056e9b6, 0xf1b3a6847e610274,
+      0x59dc584137c0cb55, 0xafb36238ad797ec6, 0xb9f331777a030104,
+      0x2899a97f98140d3b, 0x0ae342d8d4f80ad0, 0xcfb25544c873fbd1,
+      0xcb2498ca84b01a4c, 0x1a50e4a0d8db8406, 0x0afbf05b7c8c146e,
+      0x1b45ee2ee670d71b, 0xef9204d9fcc31075, 0x2a5750d2ef558495,
+      0x8e151888d0ce024e, 0x26882404dd58bcc9, 0x6c830f947bf2195f,
+      0x2ff0c1fa64bf8cfb, 0x8f108f869e11d224, 0x7ce757787a04fd76,
+      0xee55da944ebbbd85, 0x43563645bb00dcd0, 0x643848bf73336681,
+      0x050f54d0478ffd52, 0xc5c1bff1bc3c008c, 0x48fd0f2729c9402d,
+      0x7685b990e3e264af, 0x940dccac3264ed16, 0x688f94ee88c9ba90,
+      0xca112c7825f3944b, 0x584cb5ddba130be2, 0xb7ced01f7140dff6,
+      0xc10dcbb4e77168e6, 0xb5ea4360351ebaef,
   };
 #else
   constexpr uint64_t kGolden[kNumGoldenOutputs] = {