Refactor GroupSse2Impl to improve codegen for hashtable lookups on x86_64.

Currently, clang isn't using blsr to clear the least significant bit, because we use uint16_t as the bitmask integer type and blsr only exists for 32- and 64-bit registers. When we change to uint32_t, clang generates blsr, but it also adds an unnecessary movzx (zero-extending move) even though the high bits of the output of vpmovmskb are already zeroed. To avoid this, we use an empty inline asm statement that hides the mask value from the optimizer.

PiperOrigin-RevId: 854341062
Change-Id: I38b7e8913bf76ffd8e94b081afb52128fbcb795d
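For context, BitMask::operator++ advances to the next set bit by clearing the lowest one with `mask &= mask - 1`; that is exactly the operation blsr implements. A minimal sketch of the pattern, using a hypothetical ForEachMatch helper that is not part of the Abseil code:

    #include <cstdint>

    // On x86_64 with BMI1, `m &= m - 1` compiles to a single blsr when
    // `m` lives in a 32- or 64-bit register; there is no 16-bit blsr
    // form, so a uint16_t mask forces a longer instruction sequence.
    inline void ForEachMatch(uint32_t m, void (*f)(uint32_t)) {
      while (m != 0) {
        f(static_cast<uint32_t>(__builtin_ctz(m)));  // lowest set bit index
        m &= m - 1;  // clear lowest set bit -> blsr
      }
    }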
diff --git a/absl/container/internal/hashtable_control_bytes.h b/absl/container/internal/hashtable_control_bytes.h
index f0fd354..70016e1 100644
--- a/absl/container/internal/hashtable_control_bytes.h
+++ b/absl/container/internal/hashtable_control_bytes.h
@@ -84,7 +84,7 @@ public:
   explicit NonIterableBitMask(T mask) : mask_(mask) {}
 
-  explicit operator bool() const { return this->mask_ != 0; }
+  explicit operator bool() const { return mask_ != 0; }
 
   // Returns the index of the lowest *abstract* bit set in `self`.
   uint32_t LowestBitSet() const {
@@ -274,8 +274,12 @@ struct GroupSse2Impl {
   static constexpr size_t kWidth = 16;  // the number of slots per group
 
-  using BitMaskType = BitMask<uint16_t, kWidth>;
-  using NonIterableBitMaskType = NonIterableBitMask<uint16_t, kWidth>;
+  // There are only 16 bits, but using uint32_t instead of uint16_t allows for
+  // better codegen. In particular, there is no blsr instruction for a 16 bit
+  // register, but there is for a 32 bit register (used in BitMask::operator++).
+  using MaskInt = uint32_t;
+  using BitMaskType = BitMask<MaskInt, kWidth>;
+  using NonIterableBitMaskType = NonIterableBitMask<MaskInt, kWidth>;
 
   explicit GroupSse2Impl(const ctrl_t* pos) {
     ctrl = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
@@ -284,42 +288,35 @@
   // Returns a bitmask representing the positions of slots that match hash.
   BitMaskType Match(h2_t hash) const {
     auto match = _mm_set1_epi8(static_cast<char>(hash));
-    return BitMaskType(
-        static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
+    return BitMaskType(MoveMask(_mm_cmpeq_epi8(match, ctrl)));
   }
 
   // Returns a bitmask representing the positions of empty slots.
   NonIterableBitMaskType MaskEmpty() const {
 #ifdef ABSL_INTERNAL_HAVE_SSSE3
     // This only works because ctrl_t::kEmpty is -128.
-    return NonIterableBitMaskType(
-        static_cast<uint16_t>(_mm_movemask_epi8(_mm_sign_epi8(ctrl, ctrl))));
+    return NonIterableBitMaskType(MoveMask(_mm_sign_epi8(ctrl, ctrl)));
 #else
     auto match = _mm_set1_epi8(static_cast<char>(ctrl_t::kEmpty));
-    return NonIterableBitMaskType(
-        static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(match, ctrl))));
+    return NonIterableBitMaskType(MoveMask(_mm_cmpeq_epi8(match, ctrl)));
 #endif
   }
 
   // Returns a bitmask representing the positions of full slots.
   // Note: for `is_small()` tables group may contain the "same" slot twice:
   // original and mirrored.
-  BitMaskType MaskFull() const {
-    return BitMaskType(static_cast<uint16_t>(_mm_movemask_epi8(ctrl) ^ 0xffff));
-  }
+  BitMaskType MaskFull() const { return BitMaskType(MoveMask(ctrl) ^ 0xffff); }
 
   // Returns a bitmask representing the positions of non full slots.
   // Note: this includes: kEmpty, kDeleted, kSentinel.
   // It is useful in contexts when kSentinel is not present.
-  auto MaskNonFull() const {
-    return BitMaskType(static_cast<uint16_t>(_mm_movemask_epi8(ctrl)));
-  }
+  auto MaskNonFull() const { return BitMaskType(MoveMask(ctrl)); }
 
   // Returns a bitmask representing the positions of empty or deleted slots.
   NonIterableBitMaskType MaskEmptyOrDeleted() const {
     auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel));
-    return NonIterableBitMaskType(static_cast<uint16_t>(
-        _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(special, ctrl))));
+    return NonIterableBitMaskType(
+        MoveMask(_mm_cmpgt_epi8_fixed(special, ctrl)));
   }
 
   // Returns a bitmask representing the positions of full or sentinel slots.
@@ -327,8 +324,8 @@
   // original and mirrored.
   NonIterableBitMaskType MaskFullOrSentinel() const {
     auto special = _mm_set1_epi8(static_cast<char>(ctrl_t::kSentinel) - 1);
-    return NonIterableBitMaskType(static_cast<uint16_t>(
-        _mm_movemask_epi8(_mm_cmpgt_epi8_fixed(ctrl, special))));
+    return NonIterableBitMaskType(
+        MoveMask(_mm_cmpgt_epi8_fixed(ctrl, special)));
   }
 
   void ConvertSpecialToEmptyAndFullToDeleted(ctrl_t* dst) const {
@@ -344,6 +341,17 @@
     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), res);
   }
 
+  static MaskInt MoveMask(__m128i xmm) {
+    auto mask = static_cast<MaskInt>(_mm_movemask_epi8(xmm));
+#ifdef __clang__
+    // TODO(b/472522597): Without the inline asm, clang ends up generating an
+    // unnecessary movzx to zero the upper bits of the output, but those bits
+    // are already zero. See https://godbolt.org/z/G6xW1Ecbx.
+    asm("" : "+r"(mask));  // NOLINT
+#endif
+    return mask;
+  }
+
   __m128i ctrl;
 };
 #endif  // ABSL_INTERNAL_RAW_HASH_SET_HAVE_SSE2
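For reference, a standalone, compilable sketch of the MoveMask trick above (the free function and the plain uint32_t stand in for the member function and MaskInt in the patch):

    #include <emmintrin.h>
    #include <cstdint>

    static inline uint32_t MoveMask(__m128i xmm) {
      // vpmovmskb writes the 16-bit byte mask into the low bits of a
      // 32-bit register and zeroes the rest, so the cast costs nothing.
      auto mask = static_cast<uint32_t>(_mm_movemask_epi8(xmm));
    #ifdef __clang__
      // The empty asm emits no instructions; the "+r" constraint merely
      // makes `mask` opaque, so clang cannot re-derive its known bits
      // and insert a redundant movzx.
      asm("" : "+r"(mask));
    #endif
      return mask;
    }

The barrier applies to this one value only: it keeps `mask` in a register and hides its bit pattern from the optimizer, which is far cheaper than an `asm volatile` with a memory clobber.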