Optimize crc32 on AMD Milan+

We have AVX-encoded vector PCLMULQDQ on Milan, so use it to make crc32c
computations ~10% faster. We need to use inline asm, since building this
twice with different compiler flags for dynamic dispatch performed worse
due to missing inlining.

BM_Calculate/0                1.136n ± 0%   1.136n ± 1%        ~ (p=0.968 n=6)
BM_Calculate/1                1.420n ± 0%   1.421n ± 1%        ~ (p=0.870 n=6)
BM_Calculate/100              9.089n ± 0%   9.660n ± 1%   +6.29% (p=0.002 n=6)
BM_Calculate/2048             75.30n ± 1%   67.67n ± 1%  -10.13% (p=0.002 n=6)
BM_Calculate/10000            313.1n ± 0%   286.1n ± 0%   -8.63% (p=0.002 n=6)
BM_Calculate/500000           14.91µ ± 4%   13.49µ ± 1%   -9.48% (p=0.002 n=6)
BM_Extend/0                   1.136n ± 1%   1.136n ± 1%        ~ (p=0.636 n=6)
BM_Extend/1                   1.420n ± 0%   1.420n ± 1%        ~ (p=0.636 n=6)
BM_Extend/100                 9.247n ± 2%   9.800n ± 2%   +5.99% (p=0.002 n=6)
BM_Extend/2048                75.73n ± 1%   67.37n ± 1%  -11.04% (p=0.002 n=6)
BM_Extend/10000               313.2n ± 1%   286.2n ± 0%   -8.62% (p=0.002 n=6)
BM_Extend/500000              14.87µ ± 1%   13.57µ ± 1%   -8.74% (p=0.002 n=6)
BM_Extend/100000000           3.185m ± 2%   2.816m ± 3%  -11.60% (p=0.002 n=6)
BM_ExtendCacheMiss/10         26.07m ± 1%   26.06m ± 1%        ~ (p=1.000 n=6)
BM_ExtendCacheMiss/100        13.86m ± 4%   14.36m ± 2%   +3.61% (p=0.026 n=6)
BM_ExtendCacheMiss/1000       27.02m ± 4%   27.28m ± 4%        ~ (p=0.699 n=6)
BM_ExtendCacheMiss/100000     5.114m ± 5%   4.600m ± 8%  -10.07% (p=0.002 n=6)
BM_ExtendByZeroes/1           1.420n ± 0%   1.420n ± 0%        ~ (p=0.670 n=12)
BM_ExtendByZeroes/10          1.704n ± 1%   1.704n ± 0%        ~ (p=1.000 n=6)
BM_ExtendByZeroes/100         3.128n ± 0%   3.128n ± 0%        ~ (p=1.000 n=6)
BM_ExtendByZeroes/1000        6.758n ± 0%   6.638n ± 1%   -1.78% (p=0.002 n=6)
BM_ExtendByZeroes/10000       6.619n ± 1%   6.503n ± 0%   -1.75% (p=0.002 n=6)
BM_ExtendByZeroes/100000      8.537n ± 1%   8.479n ± 0%   -0.67% (p=0.019 n=6)
BM_ExtendByZeroes/1000000     9.766n ± 1%   9.692n ± 1%   -0.75% (p=0.002 n=6)

PiperOrigin-RevId: 900870516
Change-Id: I1382ae2ffeed35e1d55a0916290144cae5256fe0
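Note on what the inline asm computes: each 256-bit register carries two
128-bit CRC folding lanes, and one vector PCLMULQDQ folds both lanes at
once. As a hypothetical reference only (not part of this change), the
per-32-byte folding step could be written with intrinsics roughly as
below; doing it that way would require compiling the whole file with
-mavx -mvpclmulqdq, which is exactly the flag requirement this change
avoids via inline asm. The name Fold32 and the constant argument k are
illustrative, not from the patch.

// Hypothetical intrinsics rendering of one 32-byte folding step.
// Requires -mavx -mvpclmulqdq.
#include <immintrin.h>
#include <cstdint>

inline __m256i Fold32(__m256i crc, __m256i k, const uint8_t* p) {
  // Load the next 32 data bytes.
  __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
  // Carryless-multiply the high and low 64-bit halves of each 128-bit
  // lane by the folding constants.
  __m256i hi = _mm256_clmulepi64_epi128(crc, k, 0x11);
  __m256i lo = _mm256_clmulepi64_epi128(crc, k, 0x00);
  // Fold: XOR both products with the new data.
  return _mm256_xor_si256(_mm256_xor_si256(hi, lo), data);
}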
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 5a9b61a..322ec42 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -15,6 +15,7 @@
 #ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
 #define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_

+#include <array>
 #include <cstdint>

 #include "absl/base/config.h"
@@ -65,6 +66,13 @@
 using V128 = __m128i;
 #endif

+#if defined(__AVX__)
+using V256 = __m256i;
+#else
+// Placeholder for V256 when AVX is not available.
+using V256 = std::array<uint64_t, 4>;
+#endif
+
 // Starting with the initial value in |crc|, accumulates a CRC32 value for
 // unsigned integers of different sizes.
 uint32_t CRC32_u8(uint32_t crc, uint8_t v);
@@ -119,6 +127,17 @@
 // Add packed 64-bit integers in |l| and |r|.
 V128 V128_Add64(const V128 l, const V128 r);

+#if defined(__AVX__)
+inline V256 V256_LoadU(const V256* src);
+inline V256 V256_Broadcast128(const V128* src);
+#else
+template <typename T = V256>
+T V256_LoadU(const T* src);
+
+template <typename T = V256>
+T V256_Broadcast128(const V128* src);
+#endif
+
 #endif

 #if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
@@ -271,6 +290,26 @@

 #endif

+#if defined(__AVX__)
+inline V256 V256_LoadU(const V256* src) { return _mm256_loadu_si256(src); }
+
+inline V256 V256_Broadcast128(const V128* src) {
+  return _mm256_castps_si256(
+      _mm256_broadcast_ps(reinterpret_cast<const __m128*>(src)));
+}
+#elif defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || \
+    defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+template <typename T>
+inline T V256_LoadU(const T* src) {
+  return T{};
+}
+
+template <typename T>
+inline T V256_Broadcast128(const V128* src) {
+  return T{};
+}
+#endif
+
 }  // namespace crc_internal
 ABSL_NAMESPACE_END
 }  // namespace absl
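The V256_Broadcast128 helper added above duplicates one 16-byte constant
pair into both 128-bit lanes, so a single 256-bit carryless multiply
applies the same folding constants to two blocks at once. A hypothetical
standalone demonstration of that semantics (assumes AVX only; not part of
the patch):

// Both 128-bit lanes of the broadcast result hold the same two 64-bit
// constants. Compile with -mavx.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(16) const uint64_t k[2] = {0x1111, 0x2222};
  __m256i v = _mm256_castps_si256(
      _mm256_broadcast_ps(reinterpret_cast<const __m128*>(k)));
  alignas(32) uint64_t out[4];
  _mm256_store_si256(reinterpret_cast<__m256i*>(out), v);
  // Prints: 1111 2222 1111 2222
  std::printf("%llx %llx %llx %llx\n",
              static_cast<unsigned long long>(out[0]),
              static_cast<unsigned long long>(out[1]),
              static_cast<unsigned long long>(out[2]),
              static_cast<unsigned long long>(out[3]));
}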
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index ebd9c3f..b36b4d6 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -357,6 +357,74 @@
     crc[2] = crc2;
   }

+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) && defined(__AVX__)
+  // This is only used if we have a vector version of PCLMULQDQ.
+  // We don't have it on ARM, and it isn't supported by default
+  // compiler targets on x86. If we want to use it, we need to either use
+  // new compiler flags for the whole function and compile it twice
+  // with new and default flags, or use inline asm.
+  // The code below is the same as FinalizePclmulStream, but with
+  // PCLMUL and XOR operating on 2 values in a vector at the same time.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+  FinalizeVpclmulStream(V256* partialCRC) const {
+    uint64_t crc = 0;
+    uint64_t low64, high64;
+    __asm__(
+        // Reduce the 2 256-bit vectors into a single 256-bit vector.
+        "vbroadcasti128 %[k256], %%ymm0 \n"
+        "vpclmulqdq $0x00, %%ymm0, %[crc0], %%ymm1 \n"
+        "vpclmulqdq $0x11, %%ymm0, %[crc0], %%ymm2 \n"
+        "vpxor %%ymm2, %%ymm1, %%ymm1 \n"
+        "vpxor %[crc1], %%ymm1, %%ymm1 \n"
+        // Reduce the upper and lower parts of the 256-bit vector.
+        "vextracti128 $1, %%ymm1, %%xmm2 \n"
+        "vpclmulqdq $0x00, %[k128], %%xmm1, %%xmm3 \n"
+        "vpclmulqdq $0x11, %[k128], %%xmm1, %%xmm1 \n"
+        "vpxor %%xmm1, %%xmm3, %%xmm3 \n"
+        "vpxor %%xmm2, %%xmm3, %%xmm3 \n"
+        // Move the 2 parts of the 128-bit vector into scalar registers
+        // and reduce using the scalar crc instruction.
+        "vmovq %%xmm3, %[low] \n"
+        "vpextrq $1, %%xmm3, %[high] \n"
+        "crc32q %[low], %[crc_out] \n"
+        "crc32q %[high], %[crc_out] \n"
+        : [crc_out] "+r"(crc), [low] "=&r"(low64), [high] "=&r"(high64)
+        : [k256] "m"(*(const __m128i*)kFoldAcross256Bits),
+          [crc0] "x"(partialCRC[0]), [crc1] "x"(partialCRC[1]),
+          [k128] "m"(*(const __m128i*)kFoldAcross128Bits)
+        : "ymm0", "ymm1", "ymm2", "ymm3");
+    return crc;
+  }
+
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesVpclmul(
+      const uint8_t* p, V256* vpartialCRC, V256 loopMultiplicands) const {
+    __asm__ volatile(
+        "vmovdqu (%2), %%ymm0 \n"
+        "vmovdqu 32(%2), %%ymm1 \n"
+        "vpclmulqdq $0x11, %3, %0, %%ymm2 \n"
+        "vpclmulqdq $0x11, %3, %1, %%ymm3 \n"
+        "vpclmulqdq $0x00, %3, %0, %0 \n"
+        "vpclmulqdq $0x00, %3, %1, %1 \n"
+        "vpxor %%ymm2, %0, %0 \n"
+        "vpxor %%ymm3, %1, %1 \n"
+        "vpxor %%ymm0, %0, %0 \n"
+        "vpxor %%ymm1, %1, %1 \n"
+        : "+x"(vpartialCRC[0]), "+x"(vpartialCRC[1])
+        : "r"(p), "x"(loopMultiplicands)
+        : "ymm0", "ymm1", "ymm2", "ymm3");
+  }
+#else
+  template <typename T = V256>
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesVpclmul(
+      const uint8_t* p, T* vpartialCRC, T loopMultiplicands) const {
+    static_assert(sizeof(T) == 0, "Vector PCLMUL not supported");
+  }
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+  FinalizeVpclmulStream(V256* partialCRC) const {
+    return 0;
+  }
+#endif  // defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) && defined(__AVX__)
+
   // Constants generated by './scripts/gen-crc-consts.py x86_pclmul
   // crc32_lsb_0x82f63b78' from the Linux kernel.
 alignas(16) static constexpr uint64_t kFoldAcross512Bits[2] = {
@@ -386,7 +454,7 @@
 };

 template <size_t num_crc_streams, size_t num_pclmul_streams,
-          CutoffStrategy strategy>
+          size_t num_vpclmul_streams, CutoffStrategy strategy>
 class CRC32AcceleratedX86ARMCombinedMultipleStreams
     : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
   ABSL_ATTRIBUTE_HOT
@@ -396,6 +464,9 @@
                   "Invalid number of crc streams");
     static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                   "Invalid number of pclmul streams");
+    static_assert(
+        num_vpclmul_streams >= 0 && num_vpclmul_streams <= kMaxStreams,
+        "Invalid number of vpclmul streams");
     const uint8_t* p = static_cast<const uint8_t*>(bytes);
     const uint8_t* e = p + length;
     uint32_t l = *crc;
@@ -474,17 +545,23 @@
      }

      size_t bs = static_cast<size_t>(e - p) /
-                  (num_crc_streams + num_pclmul_streams) / 64;
+                  (num_crc_streams + num_pclmul_streams + num_vpclmul_streams) /
+                  64;
+      const uint8_t* stream_start = p;
      const uint8_t* crc_streams[kMaxStreams];
-      const uint8_t* pclmul_streams[kMaxStreams];
-      // We are guaranteed to have at least one crc stream.
-      crc_streams[0] = p;
-      for (size_t i = 1; i < num_crc_streams; i++) {
-        crc_streams[i] = crc_streams[i - 1] + bs * 64;
+      for (size_t i = 0; i < num_crc_streams; i++) {
+        crc_streams[i] = stream_start;
+        stream_start += bs * 64;
      }
-      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
-      for (size_t i = 1; i < num_pclmul_streams; i++) {
-        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
+      const uint8_t* pclmul_streams[kMaxStreams];
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
+        pclmul_streams[i] = stream_start;
+        stream_start += bs * 64;
+      }
+      const uint8_t* vpclmul_streams[kMaxStreams];
+      for (size_t i = 0; i < num_vpclmul_streams; i++) {
+        vpclmul_streams[i] = stream_start;
+        stream_start += bs * 64;
      }

      // Per stream crc sums.
@@ -520,6 +597,18 @@
        pclmul_streams[i] += 16 * 4;
      }

+      V256 vpartialCRC[kMaxStreams][2];
+      V256 loopMultiplicands{};
+      loopMultiplicands =
+          V256_Broadcast128(reinterpret_cast<const V128*>(kFoldAcross512Bits));
+      for (size_t i = 0; i < num_vpclmul_streams; i++) {
+        vpartialCRC[i][0] = V256_LoadU(
+            reinterpret_cast<const V256*>(vpclmul_streams[i] + 32 * 0));
+        vpartialCRC[i][1] = V256_LoadU(
+            reinterpret_cast<const V256*>(vpclmul_streams[i] + 32 * 1));
+        vpclmul_streams[i] += 16 * 4;
+      }
+
      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
@@ -530,6 +619,10 @@
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }
+        for (size_t j = 0; j < num_vpclmul_streams; j++) {
+          PrefetchToLocalCache(reinterpret_cast<const char*>(
+              vpclmul_streams[j] + kPrefetchHorizon));
+        }

        // We process each stream in 64 byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
@@ -568,6 +661,12 @@
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
+
+        if constexpr (num_vpclmul_streams > 0) {
+          Process64BytesVpclmul(vpclmul_streams[0], vpartialCRC[0],
+                                loopMultiplicands);
+          vpclmul_streams[0] += 16 * 4;
+        }
      }

      // PCLMULQDQ based streams require special final step;
@@ -576,6 +675,13 @@
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

+      uint64_t l64_vpclmul[kMaxStreams] = {0};
+      if constexpr (num_vpclmul_streams > 0) {
+        for (size_t i = 0; i < num_vpclmul_streams; i++) {
+          l64_vpclmul[i] = FinalizeVpclmulStream(vpartialCRC[i]);
+        }
+      }
+
      // Combine all streams into single result.
      static_assert(64 % (1 << kNumDroppedBits) == 0);
      uint32_t magic = ComputeZeroConstant(bs * 64);
@@ -588,9 +694,15 @@
        l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }
+      for (size_t i = 0; i < num_vpclmul_streams; i++) {
+        l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
+        l64 ^= l64_vpclmul[i];
+      }

      // Update p.
-      if (num_pclmul_streams > 0) {
+      if constexpr (num_vpclmul_streams > 0) {
+        p = vpclmul_streams[num_vpclmul_streams - 1];
+      } else if constexpr (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
@@ -618,6 +730,10 @@
      ABSL_INTERNAL_STEP1(l, p);
    }

+    *crc = l;
+  }
+};
+
 #undef ABSL_INTERNAL_STEP8BY3
 #undef ABSL_INTERNAL_STEP8BY2
 #undef ABSL_INTERNAL_STEP8
@@ -625,10 +741,6 @@
 #undef ABSL_INTERNAL_STEP2
 #undef ABSL_INTERNAL_STEP1

-    *crc = l;
-  }
-};
-
 }  // namespace

 // Intel processors with SSE4.2 have an instruction for one particular
@@ -639,11 +751,20 @@
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
+      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+          3, 1, 0, CutoffStrategy::Fold3>();
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdTurin:
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) && defined(__AVX__)
+      // We don't have vector pclmul on ARM, but this still needs to
+      // compile.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 1, CutoffStrategy::Fold3>();
+          3, 0, 1, CutoffStrategy::Fold3>();
+#else
+      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+          3, 1, 0, CutoffStrategy::Fold3>();
+#endif
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
@@ -654,32 +775,32 @@
    case CpuType::kIntelEmeraldrapids:
    case CpuType::kIntelGraniterapidsap:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 2, CutoffStrategy::Fold3>();
+          3, 2, 0, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 0, CutoffStrategy::Fold3>();
+          3, 0, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
    case CpuType::kArmNeoverseN2:
    case CpuType::kArmNeoverseV1:
    case CpuType::kArmNeoverseN3:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          1, 1, CutoffStrategy::Unroll64CRC>();
+          1, 1, 0, CutoffStrategy::Unroll64CRC>();
    case CpuType::kAmpereSiryn:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 2, CutoffStrategy::Fold3>();
+          3, 2, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseV2:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          1, 2, CutoffStrategy::Unroll64CRC>();
+          1, 2, 0, CutoffStrategy::Unroll64CRC>();
 #if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-            1, 1, CutoffStrategy::Unroll64CRC>();
+            1, 1, 0, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
@@ -687,71 +808,13 @@
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
-          3, 0, CutoffStrategy::Fold3>();
+          3, 0, 0, CutoffStrategy::Fold3>();
 #endif
  }
 }

-std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
-  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 0, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 1, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 2, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 3, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 0, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 1, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 2, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 3, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 0, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 1, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 2, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 3, CutoffStrategy::Fold3>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 0, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 1, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 2, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      1, 3, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 0, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 1, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 2, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      2, 3, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 0, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 1, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 2, CutoffStrategy::Unroll64CRC>>());
-  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
-      3, 3, CutoffStrategy::Unroll64CRC>>());
-
-  return ret;
-}
-
 #else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

-std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
-  return std::vector<std::unique_ptr<CRCImpl>>();
-}
-
 // no hardware acceleration available
 CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }
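A closing note on FinalizeVpclmulStream: once the per-stream partial CRCs
are folded down to 128 bits, the inline asm absorbs the two 64-bit halves
with the scalar crc32q instruction. A hypothetical standalone rendering of
just that tail (assumes SSE4.2; the name ReduceHalves is illustrative, not
from the patch):

// Absorb the two 64-bit halves of the folded 128-bit remainder using the
// SSE4.2 CRC32C instruction, mirroring the crc32q pair in the asm.
// Compile with -msse4.2.
#include <nmmintrin.h>
#include <cstdint>

inline uint32_t ReduceHalves(uint64_t low64, uint64_t high64) {
  uint64_t crc = 0;
  crc = _mm_crc32_u64(crc, low64);   // crc32q %[low], %[crc_out]
  crc = _mm_crc32_u64(crc, high64);  // crc32q %[high], %[crc_out]
  return static_cast<uint32_t>(crc);
}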