Refactor GroupSse2Impl to improve codegen for hashtable lookups on x86_64.

Currently, clang doesn't use blsr to clear the least significant bit because we use uint16_t as the bitmask integer type, and blsr only exists for 32- and 64-bit registers. When we switch to uint32_t, clang generates blsr, but it also adds an unnecessary movzx (zero-extending move) even though the high bits of the output of vpmovmskb are already zero. To avoid this, we use inline asm.
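
For reference, a minimal standalone sketch of the two codegen ideas involved (the helper names below are illustrative only and not part of this change):

  #include <cstdint>
  #include <emmintrin.h>  // SSE2

  // Clearing the lowest set bit, as BitMask::operator++ does. With a 32-bit
  // operand, clang can emit blsr (BMI1) for this; there is no 16-bit form of
  // blsr, so a uint16_t operand falls back to a longer mov/and sequence.
  inline uint32_t ClearLowestBit(uint32_t mask) { return mask & (mask - 1); }

  // _mm_movemask_epi8 returns an int whose upper 16 bits are already zero,
  // but clang still inserts a movzx when the result feeds a 32-bit use. The
  // empty asm statement makes the value opaque to the optimizer, so the
  // redundant zero-extension is dropped.
  inline uint32_t MoveMaskSketch(__m128i v) {
    auto mask = static_cast<uint32_t>(_mm_movemask_epi8(v));
  #ifdef __clang__
    asm("" : "+r"(mask));
  #endif
    return mask;
  }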

PiperOrigin-RevId: 854341062
Change-Id: I38b7e8913bf76ffd8e94b081afb52128fbcb795d
diff --git a/absl/container/internal/hashtable_control_bytes.h b/absl/container/internal/hashtable_control_bytes.h
index f0fd354..70016e1 100644
--- a/absl/container/internal/hashtable_control_bytes.h
+++ b/absl/container/internal/hashtable_control_bytes.h
@@ -84,7 +84,7 @@
  public:
   explicit NonIterableBitMask(T mask) : mask_(mask) {}
 
-  explicit operator bool() const { return this->mask_ != 0; }
+  explicit operator bool() const { return mask_ != 0; }
 
   // Returns the index of the lowest *abstract* bit set in `self`.
   uint32_t LowestBitSet() const {
@@ -274,8 +274,12 @@
 
 struct GroupSse2Impl {
   static constexpr size_t kWidth = 16;  // the number of slots per group
-  using BitMaskType = BitMask<uint16_t, kWidth>;
-  using NonIterableBitMaskType = NonIterableBitMask<uint16_t, kWidth>;
+  // There are only 16 bits, but using uint32_t instead of uint16_t allows for
+  // better codegen. In particular, there is no blsr instruction for a 16-bit
+  // register, but there is for a 32-bit register (used in BitMask::operator++).
+  using MaskInt = uint32_t;
+  using BitMaskType = BitMask<MaskInt, kWidth>;
+  using NonIterableBitMaskType = NonIterableBitMask<MaskInt, kWidth>;
 
   explicit GroupSse2Impl(const ctrl_t* pos) {
     ctrl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
@@ -284,42 +288,35 @@
   // Returns a bitmask representing the positions of slots that match hash.
   BitMaskType Match(h2_t hash) const {
     auto match = _mm_set1_epi8(static_cast<char>(hash));
-    return BitMaskType(
-        static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
+    return BitMaskType(MoveMask(_mm_cmpeq_epi8(match, ctrl)));
   }
 
   // Returns a bitmask representing the positions of empty slots.
   NonIterableBitMaskType MaskEmpty() const {
 #ifdef ABSL_INTERNAL_HAVE_SSSE3
     // This only works because ctrl_t::kEmpty is -128.
-    return NonIterableBitMaskType(
-        static_cast<uint16_t>(_mm_movemask_epi8(_mm_sign_epi8(ctrl, ctrl))));
+    return NonIterableBitMaskType(MoveMask(_mm_sign_epi8(ctrl, ctrl)));
 #else
     auto match = _mm_set1_epi8(static_cast<char>(ctrl_t::kEmpty));
-    return NonIterableBitMaskType(
-        static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
+    return NonIterableBitMaskType(MoveMask(_mm_cmpeq_epi8(match, ctrl)));
 #endif
   }
 
   // Returns a bitmask representing the positions of full slots.
   // Note: for `is_small()` tables group may contain the "same" slot twice:
   // original and mirrored.
-  BitMaskType MaskFull() const {
-    return BitMaskType(static_cast<uint16_t>(_mm_movemask_epi8(ctrl) ^ 0xffff));
-  }
+  BitMaskType MaskFull() const { return BitMaskType(MoveMask(ctrl) ^ 0xffff); }
 
   // Returns a bitmask representing the positions of non full slots.
   // Note: this includes: kEmpty, kDeleted, kSentinel.
   // It is useful in contexts when kSentinel is not present.
-  auto MaskNonFull() const {
-    return BitMaskType(static_cast<uint16_t>(_mm_movemask_epi8(ctrl)));
-  }
+  auto MaskNonFull() const { return BitMaskType(MoveMask(ctrl)); }
 
   // Returns a bitmask representing the positions of empty or deleted slots.
   NonIterableBitMaskType MaskEmptyOrDeleted() const {
     auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel));
-    return NonIterableBitMaskType(static_cast<uint16_t>(
-        _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl))));
+    return NonIterableBitMaskType(
+        MoveMask(_mm_cmpgt_epi8_fixed(special, ctrl)));
   }
 
   // Returns a bitmask representing the positions of full or sentinel slots.
@@ -327,8 +324,8 @@
   // original and mirrored.
   NonIterableBitMaskType MaskFullOrSentinel() const {
     auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel) - 1);
-    return NonIterableBitMaskType(static_cast<uint16_t>(
-        _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(ctrl, special))));
+    return NonIterableBitMaskType(
+        MoveMask(_mm_cmpgt_epi8_fixed(ctrl, special)));
   }
 
   void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const {
@@ -344,6 +341,17 @@
     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), res);
   }
 
+  static MaskInt MoveMask(__m128i xmm) {
+    auto mask = static_cast<MaskInt>(_mm_movemask_epi8(xmm));
+#ifdef __clang__
+    // TODO(b/472522597): Without the inline asm, clang ends up generating an
+    // unnecessary movzx to zero the upper bits of the output, but those bits
+    // are already zero. See https://godbolt.org/z/G6xW1Ecbx.
+    asm("" : "+r"(mask));  // NOLINT
+#endif
+    return mask;
+  }
+
   __m128i ctrl;
 };
 #endif  // ABSL_INTERNAL_RAW_HASH_SET_HAVE_SSE2