Change PrecombineLengthMix to sample data from kStaticRandomData.

The idea is based on Carbon hashing's SampleRandomData().

This also fixes an issue that we found when adding debug asserts to detect long probe sequences in absl hash tables. In that case, the keys were length-29 strings with all of the entropy in the 2nd and 3rd bytes, and we saw many collisions due to low entropy in the last byte of the hash. The collisions appear to have been caused by low entropy in the input seed; mixing the length this way solves that issue.
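
Roughly, the new scheme uses the (small) length as a byte offset into the static
random data and XORs the 8 bytes found there into the state, so every length
selects a different high-entropy word and all bits of the state are perturbed.
A simplified stand-alone sketch follows (the real implementation is in
absl/hash/internal/hash.h in the diff below; kPool here is a stand-in for
kStaticRandomData, filled with the same hexadecimal digits of Pi):

  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  // Stand-in entropy pool: hexadecimal digits of Pi's fractional component.
  inline constexpr uint64_t kPool[8] = {
      0x243f'6a88'85a3'08d3, 0x1319'8a2e'0370'7344,
      0xa409'3822'299f'31d0, 0x082e'fa98'ec4e'6c89,
      0x4528'21e6'38d0'1377, 0xbe54'66cf'34e9'0c6c,
      0xc0ac'29b7'c97c'50dd, 0x3f84'd5b5'b547'0917,
  };

  // Sketch: sample 8 bytes of random data at offset `len` and XOR them in.
  // Caller must guarantee len + sizeof(uint64_t) <= sizeof(kPool).
  inline uint64_t PrecombineLengthMixSketch(uint64_t state, std::size_t len) {
    uint64_t data;
    std::memcpy(&data, reinterpret_cast<const char*>(kPool) + len,
                sizeof(data));
    return state ^ data;
  }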

Also:
- Fix a bug in HashtableDebugAccess::GetNumProbes.
- Move the DoubleRange test from flat_hash_set_test.cc to hash_test.cc since it's a test for hash quality.
- Fix using-declaration style in hash_test.cc to comply with the Google C++ style guide.

PiperOrigin-RevId: 774891965
Change-Id: I22f18a0288e9bfb2734f7b8c6e788b623a1db0f5
diff --git a/absl/container/flat_hash_set_test.cc b/absl/container/flat_hash_set_test.cc
index f2c3fbd..ca069b4 100644
--- a/absl/container/flat_hash_set_test.cc
+++ b/absl/container/flat_hash_set_test.cc
@@ -397,21 +397,6 @@
             false);
 }
 
-// Test that we don't cause excessive collisions on the hash table for
-// doubles in the range [-1024, 1024]. See cl/773069881 for more information.
-TEST(FlatHashSet, DoubleRange) {
-  using absl::container_internal::hashtable_debug_internal::
-      HashtableDebugAccess;
-  absl::flat_hash_set<double> set;
-  for (double t = -1024.0; t < 1024.0; t += 1.0) {
-    set.insert(t);
-  }
-  for (double t = -1024.0; t < 1024.0; t += 1.0) {
-    ASSERT_LT(HashtableDebugAccess<decltype(set)>::GetNumProbes(set, t), 64)
-        << t;
-  }
-}
-
 }  // namespace
 }  // namespace container_internal
 ABSL_NAMESPACE_END
diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
index 08c5d57..94a3249 100644
--- a/absl/container/internal/raw_hash_set.h
+++ b/absl/container/internal/raw_hash_set.h
@@ -3713,7 +3713,7 @@
 
   static size_t GetNumProbes(const Set& set,
                              const typename Set::key_type& key) {
-    if (set.is_soo()) return 0;
+    if (set.is_small()) return 0;
     size_t num_probes = 0;
     const size_t hash = set.hash_of(key);
     auto seq = probe(set.common(), hash);
diff --git a/absl/hash/hash_test.cc b/absl/hash/hash_test.cc
index 85e34c9..0ad92cf 100644
--- a/absl/hash/hash_test.cc
+++ b/absl/hash/hash_test.cc
@@ -58,17 +58,17 @@
 
 namespace {
 
+using ::absl::Hash;
+using ::absl::container_internal::hashtable_debug_internal::
+    HashtableDebugAccess;
+using ::absl::hash_internal::SpyHashState;
 using ::absl::hash_test_internal::is_hashable;
 using ::absl::hash_test_internal::TypeErasedContainer;
 using ::absl::hash_test_internal::TypeErasedValue;
-using ::testing::SizeIs;
 
 template <typename T>
 using TypeErasedVector = TypeErasedContainer<std::vector<T>>;
 
-using absl::Hash;
-using absl::hash_internal::SpyHashState;
-
 template <typename T>
 class HashValueIntTest : public testing::Test {
 };
@@ -1239,4 +1239,50 @@
   EXPECT_NE(absl::HashOf(-1.0), absl::HashOf(1.0));
 }
 
+// Test that we don't cause excessive collisions on the hash table for
+// doubles in the range [-1024, 1024]. See cl/773069881 for more information.
+TEST(SwisstableCollisions, DoubleRange) {
+#ifdef GOOGLE_UNSUPPORTED_OS_LOONIX
+  // TODO(b/424834054): make this test pass on Loonix.
+  GTEST_SKIP() << "Test fails on Loonix.";
+#endif
+
+  absl::flat_hash_set<double> set;
+  for (double t = -1024.0; t < 1024.0; t += 1.0) {
+    set.insert(t);
+    ASSERT_LT(HashtableDebugAccess<decltype(set)>::GetNumProbes(set, t), 64)
+        << t;
+  }
+}
+
+// Test that for each pair of adjacent bytes in a string, if there's only
+// entropy in those two bytes, then we don't have excessive collisions.
+TEST(SwisstableCollisions, LowEntropyStrings) {
+  if (sizeof(size_t) < 8) {
+    // TODO(b/424834054): make this test pass on 32-bit platforms. We need to
+    // make 32-bit Mix() stronger.
+    GTEST_SKIP() << "Test fails on 32-bit platforms";
+  }
+
+  constexpr char kMinChar = 0;
+  constexpr char kMaxChar = 64;
+  // These sizes cover the different string-hashing code paths.
+  for (size_t size : {8u, 16u, 32u, 64u}) {
+    for (size_t b = 0; b < size - 1; ++b) {
+      absl::flat_hash_set<std::string> set;
+      std::string s(size, '\0');
+      for (char c1 = kMinChar; c1 < kMaxChar; ++c1) {
+        for (char c2 = kMinChar; c2 < kMaxChar; ++c2) {
+          s[b] = c1;
+          s[b + 1] = c2;
+          set.insert(s);
+          ASSERT_LT(HashtableDebugAccess<decltype(set)>::GetNumProbes(set, s),
+                    64)
+              << size << " " << b;
+        }
+      }
+    }
+  }
+}
+
 }  // namespace
diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc
index 87d2061..8dcc5bd 100644
--- a/absl/hash/internal/hash.cc
+++ b/absl/hash/internal/hash.cc
@@ -100,10 +100,10 @@
 ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t HashBlockOn32Bit(
     const unsigned char* data, size_t len, uint64_t state) {
   // TODO(b/417141985): expose and use CityHash32WithSeed.
-  return Mix(
-      PrecombineLengthMix(state, len) ^
-          hash_internal::CityHash32(reinterpret_cast<const char*>(data), len),
-      kMul);
+  // Note: we can't use PrecombineLengthMix here because len can be up to 1024.
+  return Mix((state + len) ^ hash_internal::CityHash32(
+                                 reinterpret_cast<const char*>(data), len),
+             kMul);
 }
 
 ABSL_ATTRIBUTE_NOINLINE uint64_t
diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
index dc37405..0b4f8be 100644
--- a/absl/hash/internal/hash.h
+++ b/absl/hash/internal/hash.h
@@ -935,21 +935,6 @@
                     hash_internal::WeaklyMixedInteger{size});
 }
 
-// Extremely weak mixture of length that is added to the state before combining
-// the data. It is used only for small strings.
-inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) {
-  // The length is always one byte here. We place it to 4th byte for the
-  // following reasons:
-  // 1. 4th byte is unused for very short strings 0-3 bytes.
-  // 2. 4th byte is duplicated for 4 bytes string.
-  // 3. 4th byte is in the middle and mixed well for 5-8 bytes strings.
-  //
-  // There were experiments with adding just `len` here.
-  // Also seems have slightly better performance overall, that gives collisions
-  // for small strings.
-  return state + (uint64_t{len} << 24);
-}
-
  inline constexpr uint64_t kMul = uint64_t{0xdcb22ca68cb134ed};
 
 // Random data taken from the hexadecimal digits of Pi's fractional component.
@@ -959,6 +944,16 @@
     0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377,
 };
 
+// Extremely weak mixture of length that is mixed into the state before
+// combining the data. It is used only for small strings. This also ensures that
+// we have high entropy in all bits of the state.
+inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) {
+  ABSL_ASSUME(len + sizeof(uint64_t) <= sizeof(kStaticRandomData));
+  uint64_t data = absl::base_internal::UnalignedLoad64(
+      reinterpret_cast<const unsigned char*>(&kStaticRandomData[0]) + len);
+  return state ^ data;
+}
+
 ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
   // For 32 bit platforms we are trying to use all 64 lower bits.
   if constexpr (sizeof(size_t) < 8) {