Neon: Use byte-swap builtins instead of inline asm

Define compiler-independent byte-swap macros and use them instead of
executing 'rev' via inline assembly code with GCC-compatible compilers
or a slow shift-store sequence with Visual C++.

* This produces identical assembly code with:

  - 64-bit GCC 8.4.0 (Linux)
  - 64-bit GCC 9.3.0 (Linux)
  - 64-bit Clang 10.0.0 (Linux)
  - 64-bit Clang 10.0.0 (MinGW)
  - 64-bit Clang 12.0.0 (Xcode 12.2, macOS)
  - 64-bit Clang 12.0.0 (Xcode 12.2, iOS)

* This produces different assembly code with:

  - 64-bit GCC 4.9.1 (Linux)
  - 32-bit GCC 4.8.2 (Linux)
  - 32-bit GCC 8.4.0 (Linux)
  - 32-bit GCC 9.3.0 (Linux)
    Since the intrinsics implementation of Huffman encoding is not used
    by default with these compilers, this is not a concern.

  - 32-bit Clang 10.0.0 (Linux)
    The generated assembly code differs, but the change was verified to be
    performance-neutral.

Closes #507
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
index d30759f..d4edd5e 100644
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009, 2018, D. R. Commander.
  * Copyright (C) 2018, Matthias Räncker.
- * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2020-2021, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
@@ -56,24 +56,6 @@
  */
 #if defined(__aarch64__) || defined(_M_ARM64)
 
-#if defined(_MSC_VER) && !defined(__clang__)
-#define SPLAT() { \
-  buffer[0] = (JOCTET)(put_buffer >> 56); \
-  buffer[1] = (JOCTET)(put_buffer >> 48); \
-  buffer[2] = (JOCTET)(put_buffer >> 40); \
-  buffer[3] = (JOCTET)(put_buffer >> 32); \
-  buffer[4] = (JOCTET)(put_buffer >> 24); \
-  buffer[5] = (JOCTET)(put_buffer >> 16); \
-  buffer[6] = (JOCTET)(put_buffer >>  8); \
-  buffer[7] = (JOCTET)(put_buffer      ); \
-}
-#else
-#define SPLAT() { \
-  __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
-  *((uint64_t *)buffer) = put_buffer; \
-}
-#endif
-
 #define FLUSH() { \
   if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
     EMIT_BYTE(put_buffer >> 56) \
@@ -85,27 +67,13 @@
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    SPLAT() \
+    *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
     buffer += 8; \
   } \
 }
 
 #else
 
-#if defined(_MSC_VER) && !defined(__clang__)
-#define SPLAT() { \
-  buffer[0] = (JOCTET)(put_buffer >> 24); \
-  buffer[1] = (JOCTET)(put_buffer >> 16); \
-  buffer[2] = (JOCTET)(put_buffer >>  8); \
-  buffer[3] = (JOCTET)(put_buffer      ); \
-}
-#else
-#define SPLAT() { \
-  __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
-  *((uint32_t *)buffer) = put_buffer; \
-}
-#endif
-
 #define FLUSH() { \
   if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
     EMIT_BYTE(put_buffer >> 24) \
@@ -113,7 +81,7 @@
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    SPLAT() \
+    *((uint32_t *)buffer) = BUILTIN_BSWAP32(put_buffer); \
     buffer += 4; \
   } \
 }
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index 23d6d28..436c402 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -23,13 +23,17 @@
 #cmakedefine HAVE_VLD1_U16_X2
 #cmakedefine HAVE_VLD1Q_U8_X4
 
-/* Define compiler-independent count-leading-zeros macros */
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
 #if defined(_MSC_VER) && !defined(__clang__)
 #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
 #define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP32(x)  _byteswap_ulong(x)
+#define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
 #elif defined(__clang__) || defined(__GNUC__)
 #define BUILTIN_CLZ(x)  __builtin_clz(x)
 #define BUILTIN_CLZLL(x)  __builtin_clzll(x)
+#define BUILTIN_BSWAP32(x)  __builtin_bswap32(x)
+#define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
 #else
 #error "Unknown compiler"
 #endif