Neon: Auto-detect compiler intrinsics completeness

This allows the Neon intrinsics code to be built successfully (albeit
likely with reduced run-time performance) with Xcode 5.0-6.2
(iOS/AArch64) and Android NDK < r19 (AArch32).  Note that Xcode 5.0-6.2
will not build the Armv8 GAS code without gas-preprocessor.pl, and no
version of Xcode will build the Armv7 GAS code without
gas-preprocessor.pl, so we always use the full Neon intrinsics
implementation by default with macOS and iOS builds.

Auto-detecting the completeness of the compiler's set of Neon intrinsics
also allows us to more intelligently set the default value of
NEON_INTRINSICS, based on the values of HAVE_VLD1*.  This is a
reasonable, albeit imperfect, proxy for whether a compiler has a full
and optimal set of Neon intrinsics.  Specific notes:

  - 64-bit RGB-to-YCbCr color conversion
    does not use any of the intrinsics in question, regresses with GCC
  - 64-bit accurate integer forward DCT
    uses vld1_s16_x3(), regresses with GCC
  - 64-bit Huffman encoding
    uses vld1q_u8_x4(), regresses with GCC
  - 64-bit YCbCr-to-RGB color conversion
    does not use any of the intrinsics in question, regresses with GCC
  - 64-bit accurate integer inverse DCT
    uses vld1_s16_x3(), regresses with GCC
  - 64-bit 4x4 inverse DCT
    uses vld1_s16_x3().  I did not test this algorithm in isolation, so
    it may in fact regress with GCC, but the regression may be hidden by
    the speedup from the new SIMD-accelerated upsampling algorithms.

  - 32-bit RGB-to-YCbCr color conversion:
    uses vld1_u16_x2(), regresses with GCC
  - 32-bit accurate integer forward DCT
    uses vld1_s16_x3(), regression irrelevant because there was no
    previous implementation
  - 32-bit accurate integer inverse DCT
    uses vld1_s16_x3(), regresses with GCC
  - 32-bit fast integer inverse DCT
    does not use any of the intrinsics in question, regresses with GCC
  - 32-bit 4x4 inverse DCT
    uses vld1_s16_x3().  I did not test this algorithm in isolation, so
    it may in fact regress with GCC, but the regression may be hidden by
    the speedup from the new SIMD-accelerated upsampling algorithms.

Presumably when GCC includes a full and optimal set of Neon intrinsics,
the HAVE_VLD1* tests will pass, and the full Neon intrinsics
implementation will be enabled automatically.
diff --git a/BUILDING.md b/BUILDING.md
index 8a19f01..116d0dd 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -402,7 +402,7 @@
 
 ### Armv8 (64-bit)
 
-**Xcode 6.3.x or later required**
+**Xcode 5 or later required, Xcode 6.3.x or later recommended**
 
 The following script demonstrates how to build libjpeg-turbo to run on the
 iPhone 5S/iPad Mini 2/iPad Air and newer.
@@ -434,6 +434,8 @@
 
 ### Armv7 (32-bit)
 
+**NDK r19 or later with Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
@@ -459,6 +461,8 @@
 
 ### Armv8 (64-bit)
 
+**Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49c4f90..e2759e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,9 +139,9 @@
 
 macro(boolean_number var)
   if(${var})
-    set(${var} 1)
+    set(${var} 1 ${ARGN})
   else()
-    set(${var} 0)
+    set(${var} 0 ${ARGN})
   endif()
 endmacro()
 
@@ -548,28 +548,10 @@
 endif()
 
 if(WITH_SIMD)
-  if(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
-    # GCC doesn't yet have a full or optimal set of Neon intrinsics, so for
-    # performance reasons, when using GCC, we default to using the older GAS
-    # implementation of the Neon SIMD extensions for certain algorithms.
-    if(CMAKE_COMPILER_IS_GNUCC)
-      set(DEFAULT_NEON_INTRINSICS 0)
-    else()
-      set(DEFAULT_NEON_INTRINSICS 1)
-    endif()
-    option(NEON_INTRINSICS
-      "Because GCC doesn't yet have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with GCC is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
-      ${DEFAULT_NEON_INTRINSICS})
-    boolean_number(NEON_INTRINSICS)
-    if(NEON_INTRINSICS)
-      add_definitions(-DNEON_INTRINSICS)
-      message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
-    else()
-      message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
-    endif()
-  endif()
-
   add_subdirectory(simd)
+  if(NEON_INTRINSICS)
+    add_definitions(-DNEON_INTRINSICS)
+  endif()
 elseif(NOT WITH_12BIT)
   message(STATUS "SIMD extensions: None (WITH_SIMD = ${WITH_SIMD})")
 endif()
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 3636e6f..f3c24ef 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -213,6 +213,42 @@
 
 elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
 
+include(CheckSymbolExists)
+if(BITS EQUAL 32)
+  set(CMAKE_REQUIRED_FLAGS -mfpu=neon)
+endif()
+check_symbol_exists(vld1_s16_x3 arm_neon.h HAVE_VLD1_S16_X3)
+check_symbol_exists(vld1_u16_x2 arm_neon.h HAVE_VLD1_U16_X2)
+check_symbol_exists(vld1q_u8_x4 arm_neon.h HAVE_VLD1Q_U8_X4)
+if(BITS EQUAL 32)
+  unset(CMAKE_REQUIRED_FLAGS)
+endif()
+configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
+
+# GCC (as of this writing) and some older versions of Clang do not have a full
+# or optimal set of Neon intrinsics, so for performance reasons, when using
+# those compilers, we default to using the older GAS implementation of the Neon
+# SIMD extensions for certain algorithms.  The presence or absence of the three
+# intrinsics we tested above is a reasonable proxy for this.  We always default
+# to using the full Neon intrinsics implementation when building for macOS or
+# iOS, to avoid the need for gas-preprocessor.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4) OR APPLE)
+  set(DEFAULT_NEON_INTRINSICS 1)
+else()
+  set(DEFAULT_NEON_INTRINSICS 0)
+endif()
+option(NEON_INTRINSICS
+  "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
+  ${DEFAULT_NEON_INTRINSICS})
+boolean_number(NEON_INTRINSICS PARENT_SCOPE)
+if(NEON_INTRINSICS)
+  add_definitions(-DNEON_INTRINSICS)
+  message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+else()
+  message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+endif()
+
 set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
   arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
   arm/jidctred-neon.c arm/jquanti-neon.c)
diff --git a/simd/arm/aarch32/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c
index 38f90c4..96b44d8 100644
--- a/simd/arm/aarch32/jccolext-neon.c
+++ b/simd/arm/aarch32/jccolext-neon.c
@@ -2,6 +2,7 @@
  * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -53,7 +54,7 @@
   JSAMPROW outptr0, outptr1, outptr2;
 
   /* Set up conversion constants. */
-#if defined(__clang__)
+#ifdef HAVE_VLD1_U16_X2
   const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
 #else
   /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index 25ede30..808fa95 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -2,6 +2,7 @@
  * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -32,6 +33,7 @@
 #include "../../jsimd.h"
 #include "../align.h"
 #include "../jchuff.h"
+#include "neon-compat.h"
 
 #include <limits.h>
 
@@ -65,7 +67,7 @@
   uint16_t block_diff[DCTSIZE2];
 
   /* Load lookup table indices for rows of zig-zag ordering. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1Q_U8_X4
   const uint8x16x4_t idx_rows_0123 =
     vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
   const uint8x16x4_t idx_rows_4567 =
@@ -87,7 +89,7 @@
 #endif
 
   /* Load 8x8 block of DCT coefficients. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1Q_U8_X4
   const int8x16x4_t tbl_rows_0123 =
     vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
   const int8x16x4_t tbl_rows_4567 =
diff --git a/simd/arm/jccolor-neon.c b/simd/arm/jccolor-neon.c
index 1f8d007..f18ed9e 100644
--- a/simd/arm/jccolor-neon.c
+++ b/simd/arm/jccolor-neon.c
@@ -2,6 +2,7 @@
  * jccolor-neon.c - colorspace conversion (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
diff --git a/simd/arm/jfdctint-neon.c b/simd/arm/jfdctint-neon.c
index 5e891e6..ccfc07b 100644
--- a/simd/arm/jfdctint-neon.c
+++ b/simd/arm/jfdctint-neon.c
@@ -2,6 +2,7 @@
  * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -85,7 +87,7 @@
 void jsimd_fdct_islow_neon(DCTELEM *data)
 {
   /* Load DCT constants. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
 #else
   /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
diff --git a/simd/arm/jidctint-neon.c b/simd/arm/jidctint-neon.c
index cf4a946..043b652 100644
--- a/simd/arm/jidctint-neon.c
+++ b/simd/arm/jidctint-neon.c
@@ -30,6 +30,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -354,7 +355,7 @@
                                                   int16_t *workspace_2)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -481,7 +482,7 @@
                                                  int16_t *workspace_2)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -565,7 +566,7 @@
                                                   unsigned buf_offset)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
@@ -712,7 +713,7 @@
                                                  unsigned buf_offset)
 {
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
 #else
   const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
diff --git a/simd/arm/jidctred-neon.c b/simd/arm/jidctred-neon.c
index 023eb7d..be9627e 100644
--- a/simd/arm/jidctred-neon.c
+++ b/simd/arm/jidctred-neon.c
@@ -2,6 +2,7 @@
  * jidctred-neon.c - reduced-size IDCT (Arm Neon)
  *
  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -28,6 +29,7 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 #include "align.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -221,7 +223,7 @@
   int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
 
   /* Load constants for IDCT computation. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
   const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
 #else
   /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
new file mode 100644
index 0000000..7a03d81
--- /dev/null
+++ b/simd/arm/neon-compat.h.in
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4