Neon: Use byte-swap builtins instead of inline asm
Define compiler-independent byte-swap macros and use them instead of
executing 'rev' via inline assembly code with GCC-compatible compilers
or a slow shift-store sequence with Visual C++.
* This produces identical assembly code with:
- 64-bit GCC 8.4.0 (Linux)
- 64-bit GCC 9.3.0 (Linux)
- 64-bit Clang 10.0.0 (Linux)
- 64-bit Clang 10.0.0 (MinGW)
- 64-bit Clang 12.0.0 (Xcode 12.2, macOS)
- 64-bit Clang 12.0.0 (Xcode 12.2, iOS)
* This produces different assembly code with:
- 64-bit GCC 4.9.1 (Linux)
- 32-bit GCC 4.8.2 (Linux)
- 32-bit GCC 8.4.0 (Linux)
- 32-bit GCC 9.3.0 (Linux)
Since the intrinsics implementation of Huffman encoding is not used
by default with the GCC versions listed above, this is not a concern.
- 32-bit Clang 10.0.0 (Linux)
Verified performance neutrality with this compiler.
Closes #507
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
index d30759f..d4edd5e 100644
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -6,7 +6,7 @@
* libjpeg-turbo Modifications:
* Copyright (C) 2009, 2018, D. R. Commander.
* Copyright (C) 2018, Matthias Räncker.
- * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2020-2021, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
@@ -56,24 +56,6 @@
*/
#if defined(__aarch64__) || defined(_M_ARM64)
-#if defined(_MSC_VER) && !defined(__clang__)
-#define SPLAT() { \
- buffer[0] = (JOCTET)(put_buffer >> 56); \
- buffer[1] = (JOCTET)(put_buffer >> 48); \
- buffer[2] = (JOCTET)(put_buffer >> 40); \
- buffer[3] = (JOCTET)(put_buffer >> 32); \
- buffer[4] = (JOCTET)(put_buffer >> 24); \
- buffer[5] = (JOCTET)(put_buffer >> 16); \
- buffer[6] = (JOCTET)(put_buffer >> 8); \
- buffer[7] = (JOCTET)(put_buffer ); \
-}
-#else
-#define SPLAT() { \
- __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
- *((uint64_t *)buffer) = put_buffer; \
-}
-#endif
-
#define FLUSH() { \
if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
EMIT_BYTE(put_buffer >> 56) \
@@ -85,27 +67,13 @@
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
- SPLAT() \
+ *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
buffer += 8; \
} \
}
#else
-#if defined(_MSC_VER) && !defined(__clang__)
-#define SPLAT() { \
- buffer[0] = (JOCTET)(put_buffer >> 24); \
- buffer[1] = (JOCTET)(put_buffer >> 16); \
- buffer[2] = (JOCTET)(put_buffer >> 8); \
- buffer[3] = (JOCTET)(put_buffer ); \
-}
-#else
-#define SPLAT() { \
- __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
- *((uint32_t *)buffer) = put_buffer; \
-}
-#endif
-
#define FLUSH() { \
if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
EMIT_BYTE(put_buffer >> 24) \
@@ -113,7 +81,7 @@
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
- SPLAT() \
+ *((uint32_t *)buffer) = BUILTIN_BSWAP32(put_buffer); \
buffer += 4; \
} \
}
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index 23d6d28..436c402 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -23,13 +23,17 @@
#cmakedefine HAVE_VLD1_U16_X2
#cmakedefine HAVE_VLD1Q_U8_X4
-/* Define compiler-independent count-leading-zeros macros */
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
#if defined(_MSC_VER) && !defined(__clang__)
#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP32(x) _byteswap_ulong(x)
+#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
#elif defined(__clang__) || defined(__GNUC__)
#define BUILTIN_CLZ(x) __builtin_clz(x)
#define BUILTIN_CLZLL(x) __builtin_clzll(x)
+#define BUILTIN_BSWAP32(x) __builtin_bswap32(x)
+#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
#else
#error "Unknown compiler"
#endif