Optimize crc32 on AMD Milan+

Milan and newer AMD CPUs support the AVX-encoded (256-bit) vector form of
PCLMULQDQ, so use it to make crc32c computations ~10% faster. We need to
use inline asm: building this function twice with different compiler flags
for dynamic dispatch performed worse due to missing inlining.
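
For reference, the per-stream fold in the inline asm below corresponds
roughly to this intrinsics sketch (illustrative only; Fold32Bytes is a
hypothetical name, and the intrinsics would themselves require building
with -mvpclmulqdq, which is exactly what the inline asm avoids):

    #include <immintrin.h>
    #include <cstdint>

    // Fold a 256-bit CRC state over the next 32 bytes at |p|: carrylessly
    // multiply both 64-bit halves of each 128-bit lane by the fold constant,
    // then XOR the two products together with the newly loaded data.
    static inline __m256i Fold32Bytes(__m256i state, __m256i mult,
                                      const uint8_t* p) {
      __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
      __m256i hi = _mm256_clmulepi64_epi128(state, mult, 0x11);
      __m256i lo = _mm256_clmulepi64_epi128(state, mult, 0x00);
      return _mm256_xor_si256(_mm256_xor_si256(hi, lo), data);
    }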

BM_Calculate/0                  1.136n ±  0%    1.136n ±  1%        ~ (p=0.968 n=6)
BM_Calculate/1                  1.420n ±  0%    1.421n ±  1%        ~ (p=0.870 n=6)
BM_Calculate/100                9.089n ±  0%    9.660n ±  1%   +6.29% (p=0.002 n=6)
BM_Calculate/2048               75.30n ±  1%    67.67n ±  1%  -10.13% (p=0.002 n=6)
BM_Calculate/10000              313.1n ±  0%    286.1n ±  0%   -8.63% (p=0.002 n=6)
BM_Calculate/500000             14.91µ ±  4%    13.49µ ±  1%   -9.48% (p=0.002 n=6)
BM_Extend/0                     1.136n ±  1%    1.136n ±  1%        ~ (p=0.636 n=6)
BM_Extend/1                     1.420n ±  0%    1.420n ±  1%        ~ (p=0.636 n=6)
BM_Extend/100                   9.247n ±  2%    9.800n ±  2%   +5.99% (p=0.002 n=6)
BM_Extend/2048                  75.73n ±  1%    67.37n ±  1%  -11.04% (p=0.002 n=6)
BM_Extend/10000                 313.2n ±  1%    286.2n ±  0%   -8.62% (p=0.002 n=6)
BM_Extend/500000                14.87µ ±  1%    13.57µ ±  1%   -8.74% (p=0.002 n=6)
BM_Extend/100000000             3.185m ±  2%    2.816m ±  3%  -11.60% (p=0.002 n=6)
BM_ExtendCacheMiss/10           26.07m ±  1%    26.06m ±  1%        ~ (p=1.000 n=6)
BM_ExtendCacheMiss/100          13.86m ±  4%    14.36m ±  2%   +3.61% (p=0.026 n=6)
BM_ExtendCacheMiss/1000         27.02m ±  4%    27.28m ±  4%        ~ (p=0.699 n=6)
BM_ExtendCacheMiss/100000       5.114m ±  5%    4.600m ±  8%  -10.07% (p=0.002 n=6)
BM_ExtendByZeroes/1             1.420n ±  0%    1.420n ±  0%        ~ (p=0.670 n=12)
BM_ExtendByZeroes/10            1.704n ±  1%    1.704n ±  0%        ~ (p=1.000 n=6)
BM_ExtendByZeroes/100           3.128n ±  0%    3.128n ±  0%        ~ (p=1.000 n=6)
BM_ExtendByZeroes/1000          6.758n ±  0%    6.638n ±  1%   -1.78% (p=0.002 n=6)
BM_ExtendByZeroes/10000         6.619n ±  1%    6.503n ±  0%   -1.75% (p=0.002 n=6)
BM_ExtendByZeroes/100000        8.537n ±  1%    8.479n ±  0%   -0.67% (p=0.019 n=6)
BM_ExtendByZeroes/1000000       9.766n ±  1%    9.692n ±  1%   -0.75% (p=0.002 n=6)

PiperOrigin-RevId: 900870516
Change-Id: I1382ae2ffeed35e1d55a0916290144cae5256fe0
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 5a9b61a..322ec42 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -15,6 +15,7 @@
 #ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
 #define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
 
+#include <array>
 #include <cstdint>
 
 #include "absl/base/config.h"
@@ -65,6 +66,13 @@
 using V128 = __m128i;
 #endif
 
+#if defined(__AVX__)
+using V256 = __m256i;
+#else
+// Placeholder for V256 when AVX is not available.
+using V256 = std::array<uint64_t, 4>;
+#endif
+
 // Starting with the initial value in |crc|, accumulates a CRC32 value for
 // unsigned integers of different sizes.
 uint32_t CRC32_u8(uint32_t crc, uint8_t v);
@@ -119,6 +127,17 @@
 // Add packed 64-bit integers in |l| and |r|.
 V128 V128_Add64(const V128 l, const V128 r);
 
+#if defined(__AVX__)
+inline V256 V256_LoadU(const V256* src);
+inline V256 V256_Broadcast128(const V128* src);
+#else
+template <typename T = V256>
+T V256_LoadU(const T* src);
+
+template <typename T = V256>
+T V256_Broadcast128(const V128* src);
+#endif
+
 #endif
 
 #if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
@@ -271,6 +290,26 @@
 
 #endif
 
+#if defined(__AVX__)
+inline V256 V256_LoadU(const V256* src) { return _mm256_loadu_si256(src); }
+
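+// Duplicates the 128-bit value at |src| into both 128-bit lanes.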
+inline V256 V256_Broadcast128(const V128* src) {
+  return _mm256_castps_si256(
+      _mm256_broadcast_ps(reinterpret_cast<const __m128*>(src)));
+}
+#elif defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || \
+    defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+template <typename T>
+inline T V256_LoadU(const T* src) {
+  return T{};
+}
+
+template <typename T>
+inline T V256_Broadcast128(const V128* src) {
+  return T{};
+}
+#endif
+
 }  // namespace crc_internal
 ABSL_NAMESPACE_END
 }  // namespace absl
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index ebd9c3f..b36b4d6 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -357,6 +357,74 @@
     crc[2] = crc2;
   }
 
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) && defined(__AVX__)
+  // This is only used if we have a vector version of PCLMULQDQ
+  // (VPCLMULQDQ). We don't have it on ARM, and it isn't enabled by default
+  // compiler targets on x86. To use it without inline asm, we would have to
+  // compile this function twice, with new and with default compiler flags,
+  // and dispatch dynamically.
+  // The code below is the same as FinalizePclmulStream, but with
+  // PCLMUL and XOR operating on 2 values in a vector at the same time.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+  FinalizeVpclmulStream(V256* partialCRC) const {
+    uint64_t crc = 0;
+    uint64_t low64, high64;
+    __asm__(
+        // reduce 2 256-bit vectors into a single 256-bit vector
+        "vbroadcasti128 %[k256], %%ymm0 \n"
+        "vpclmulqdq $0x00, %%ymm0, %[crc0], %%ymm1 \n"
+        "vpclmulqdq $0x11, %%ymm0, %[crc0], %%ymm2 \n"
+        "vpxor %%ymm2, %%ymm1, %%ymm1 \n"
+        "vpxor %[crc1], %%ymm1, %%ymm1 \n"
+        // reduce upper and lower parts of 256-bit vector
+        "vextracti128 $1, %%ymm1, %%xmm2 \n"
+        "vpclmulqdq $0x00, %[k128], %%xmm1, %%xmm3 \n"
+        "vpclmulqdq $0x11, %[k128], %%xmm1, %%xmm1 \n"
+        "vpxor %%xmm1, %%xmm3, %%xmm3 \n"
+        "vpxor %%xmm2, %%xmm3, %%xmm3 \n"
+        // Move the 2 halves of the 128-bit vector into scalar registers
+        // and reduce using the scalar crc32 instruction
+        "vmovq %%xmm3, %[low] \n"
+        "vpextrq $1, %%xmm3, %[high] \n"
+        "crc32q %[low], %[crc_out] \n"
+        "crc32q %[high], %[crc_out] \n"
+        : [crc_out] "+r"(crc), [low] "=&r"(low64), [high] "=&r"(high64)
+        : [k256] "m"(*(const __m128i*)kFoldAcross256Bits),
+          [crc0] "x"(partialCRC[0]), [crc1] "x"(partialCRC[1]),
+          [k128] "m"(*(const __m128i*)kFoldAcross128Bits)
+        : "ymm0", "ymm1", "ymm2", "ymm3");
+    return crc;
+  }
+
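+  // Folds the running 512-bit state (two 256-bit vectors) over the next
+  // 64 bytes at |p|: carryless-multiply by the fold-across-512-bits
+  // constant in |loopMultiplicands| and XOR in the newly loaded data.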
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesVpclmul(
+      const uint8_t* p, V256* vpartialCRC, V256 loopMultiplicands) const {
+    __asm__ volatile(
+        "vmovdqu (%2), %%ymm0 \n"
+        "vmovdqu 32(%2), %%ymm1 \n"
+        "vpclmulqdq $0x11, %3, %0, %%ymm2 \n"
+        "vpclmulqdq $0x11, %3, %1, %%ymm3 \n"
+        "vpclmulqdq $0x00, %3, %0, %0 \n"
+        "vpclmulqdq $0x00, %3, %1, %1 \n"
+        "vpxor %%ymm2, %0, %0 \n"
+        "vpxor %%ymm3, %1, %1 \n"
+        "vpxor %%ymm0, %0, %0 \n"
+        "vpxor %%ymm1, %1, %1 \n"
+        : "+x"(vpartialCRC[0]), "+x"(vpartialCRC[1])
+        : "r"(p), "x"(loopMultiplicands)
+        : "ymm0", "ymm1", "ymm2", "ymm3");
+  }
+#else
+  template <typename T = V256>
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesVpclmul(
+      const uint8_t* p, T* vpartialCRC, T loopMultiplicands) const {
+    static_assert(sizeof(T) == 0, "Vector PCLMUL not supported");
+  }
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+  FinalizeVpclmulStream(V256* partialCRC) const {
+    return 0;
+  }
+#endif  // defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) && defined(__AVX__)
+
   // Constants generated by './scripts/gen-crc-consts.py x86_pclmul
   // crc32_lsb_0x82f63b78' from the Linux kernel.
   alignas(16) static constexpr uint64_t kFoldAcross512Bits[2] = {
@@ -386,7 +454,7 @@
 };
 
 template <size_t num_crc_streams, size_t num_pclmul_streams,
-          CutoffStrategy strategy>
+          size_t num_vpclmul_streams, CutoffStrategy strategy>
 class CRC32AcceleratedX86ARMCombinedMultipleStreams
     : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
   ABSL_ATTRIBUTE_HOT
@@ -396,6 +464,9 @@
                   "Invalid number of crc streams");
     static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                   "Invalid number of pclmul streams");
+    static_assert(
+        num_vpclmul_streams >= 0 && num_vpclmul_streams <= kMaxStreams,
+        "Invalid number of vpclmul streams");
     const uint8_t* p = static_cast<const uint8_t*>(bytes);
     const uint8_t* e = p + length;
     uint32_t l = *crc;
@@ -474,17 +545,23 @@
       }
 
       size_t bs = static_cast<size_t>(e - p) /
-                  (num_crc_streams + num_pclmul_streams) / 64;
+                  (num_crc_streams + num_pclmul_streams + num_vpclmul_streams) /
+                  64;
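+      // Split the input into consecutive bs*64-byte chunks: first the crc
+      // streams, then the pclmul streams, then the vpclmul streams.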
+      const uint8_t* stream_start = p;
       const uint8_t* crc_streams[kMaxStreams];
-      const uint8_t* pclmul_streams[kMaxStreams];
-      // We are guaranteed to have at least one crc stream.
-      crc_streams[0] = p;
-      for (size_t i = 1; i < num_crc_streams; i++) {
-        crc_streams[i] = crc_streams[i - 1] + bs * 64;
+      for (size_t i = 0; i < num_crc_streams; i++) {
+        crc_streams[i] = stream_start;
+        stream_start += bs * 64;
       }
-      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
-      for (size_t i = 1; i < num_pclmul_streams; i++) {
-        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
+      const uint8_t* pclmul_streams[kMaxStreams];
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
+        pclmul_streams[i] = stream_start;
+        stream_start += bs * 64;
+      }
+      const uint8_t* vpclmul_streams[kMaxStreams];
+      for (size_t i = 0; i < num_vpclmul_streams; i++) {
+        vpclmul_streams[i] = stream_start;
+        stream_start += bs * 64;
       }
 
       // Per stream crc sums.
@@ -520,6 +597,18 @@
         pclmul_streams[i] += 16 * 4;
       }
 
+      V256 vpartialCRC[kMaxStreams][2];
+      V256 loopMultiplicands =
+          V256_Broadcast128(reinterpret_cast<const V128*>(kFoldAcross512Bits));
+      for (size_t i = 0; i < num_vpclmul_streams; i++) {
+        vpartialCRC[i][0] = V256_LoadU(
+            reinterpret_cast<const V256*>(vpclmul_streams[i] + 32 * 0));
+        vpartialCRC[i][1] = V256_LoadU(
+            reinterpret_cast<const V256*>(vpclmul_streams[i] + 32 * 1));
+        vpclmul_streams[i] += 16 * 4;
+      }
+
       for (size_t i = 1; i < bs; i++) {
         // Prefetch data for next iterations.
         for (size_t j = 0; j < num_crc_streams; j++) {
@@ -530,6 +619,10 @@
           PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                              kPrefetchHorizon));
         }
+        for (size_t j = 0; j < num_vpclmul_streams; j++) {
+          PrefetchToLocalCache(reinterpret_cast<const char*>(
+              vpclmul_streams[j] + kPrefetchHorizon));
+        }
 
         // We process each stream in 64 byte blocks. This can be written as
         // for (int i = 0; i < num_pclmul_streams; i++) {
@@ -568,6 +661,12 @@
           Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
           pclmul_streams[2] += 16 * 4;
         }
+
+        if constexpr (num_vpclmul_streams > 0) {
+          Process64BytesVpclmul(vpclmul_streams[0], vpartialCRC[0],
+                                loopMultiplicands);
+          vpclmul_streams[0] += 16 * 4;
+        }
       }
 
       // PCLMULQDQ based streams require special final step;
@@ -576,6 +675,13 @@
         l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
       }
 
+      uint64_t l64_vpclmul[kMaxStreams] = {0};
+      if constexpr (num_vpclmul_streams > 0) {
+        for (size_t i = 0; i < num_vpclmul_streams; i++) {
+          l64_vpclmul[i] = FinalizeVpclmulStream(vpartialCRC[i]);
+        }
+      }
+
       // Combine all streams into single result.
       static_assert(64 % (1 << kNumDroppedBits) == 0);
       uint32_t magic = ComputeZeroConstant(bs * 64);
@@ -588,9 +694,15 @@
         l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
         l64 ^= l64_pclmul[i];
       }
+      for (size_t i = 0; i < num_vpclmul_streams; i++) {
+        l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
+        l64 ^= l64_vpclmul[i];
+      }
 
       // Update p.
-      if (num_pclmul_streams > 0) {
+      if constexpr (num_vpclmul_streams > 0) {
+        p = vpclmul_streams[num_vpclmul_streams - 1];
+      } else if constexpr (num_pclmul_streams > 0) {
         p = pclmul_streams[num_pclmul_streams - 1];
       } else {
         p = crc_streams[num_crc_streams - 1];
@@ -618,6 +730,10 @@
       ABSL_INTERNAL_STEP1(l, p);
     }
 
+    *crc = l;
+  }
+};
+
 #undef ABSL_INTERNAL_STEP8BY3
 #undef ABSL_INTERNAL_STEP8BY2
 #undef ABSL_INTERNAL_STEP8
@@ -625,10 +741,6 @@
 #undef ABSL_INTERNAL_STEP2
 #undef ABSL_INTERNAL_STEP1
 
-    *crc = l;
-  }
-};
-
 }  // namespace
 
 // Intel processors with SSE4.2 have an instruction for one particular
@@ -639,11 +751,20 @@
     case CpuType::kIntelHaswell:
     case CpuType::kAmdRome:
     case CpuType::kAmdNaples:
+      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+          3, 1, 0, CutoffStrategy::Fold3>();
     case CpuType::kAmdMilan:
     case CpuType::kAmdGenoa:
     case CpuType::kAmdTurin:
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) && defined(__AVX__)
+      // Vector PCLMULQDQ is unavailable on ARM and on x86 builds without
+      // AVX, but this code still needs to compile there, hence the
+      // fallback below.
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 1, CutoffStrategy::Fold3>();
+          3, 0, 1, CutoffStrategy::Fold3>();
+#else
+      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+          3, 1, 0, CutoffStrategy::Fold3>();
+#endif
     // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
     case CpuType::kIntelCascadelakeXeon:
     case CpuType::kIntelSkylakeXeon:
@@ -654,32 +775,32 @@
     case CpuType::kIntelEmeraldrapids:
     case CpuType::kIntelGraniterapidsap:
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 2, CutoffStrategy::Fold3>();
+          3, 2, 0, CutoffStrategy::Fold3>();
     // PCLMULQDQ is slow, don't use it.
     case CpuType::kIntelIvybridge:
     case CpuType::kIntelSandybridge:
     case CpuType::kIntelWestmere:
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 0, CutoffStrategy::Fold3>();
+          3, 0, 0, CutoffStrategy::Fold3>();
     case CpuType::kArmNeoverseN1:
     case CpuType::kArmNeoverseN2:
     case CpuType::kArmNeoverseV1:
     case CpuType::kArmNeoverseN3:
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          1, 1, CutoffStrategy::Unroll64CRC>();
+          1, 1, 0, CutoffStrategy::Unroll64CRC>();
     case CpuType::kAmpereSiryn:
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 2, CutoffStrategy::Fold3>();
+          3, 2, 0, CutoffStrategy::Fold3>();
     case CpuType::kArmNeoverseV2:
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          1, 2, CutoffStrategy::Unroll64CRC>();
+          1, 2, 0, CutoffStrategy::Unroll64CRC>();
 #if defined(__aarch64__)
     default:
       // Not all ARM processors support the needed instructions, so check here
       // before trying to use an accelerated implementation.
       if (SupportsArmCRC32PMULL()) {
         return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-            1, 1, CutoffStrategy::Unroll64CRC>();
+            1, 1, 0, CutoffStrategy::Unroll64CRC>();
       } else {
         return nullptr;
       }
@@ -687,71 +808,13 @@
     default:
       // Something else, play it safe and assume slow PCLMULQDQ.
       return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 0, CutoffStrategy::Fold3>();
+          3, 0, 0, CutoffStrategy::Fold3>();
 #endif
   }
 }
 
-std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
-  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 0, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 1, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 2, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 3, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 0, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 1, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 2, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 3, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 0, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 1, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 2, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 3, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 0, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 1, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 2, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    1, 3, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 0, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 1, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 2, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    2, 3, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 0, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 1, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 2, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-                    3, 3, CutoffStrategy::Unroll64CRC>>());
-
-  return ret;
-}
-
 #else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
 
-std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
-  return std::vector<std::unique_ptr<CRCImpl>>();
-}
-
 // no hardware acceleration available
 CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }