Fix Neon SIMD build issues with Visual Studio

- Use the _M_ARM and _M_ARM64 macros provided by Visual Studio for
  compile-time detection of Arm builds, since __arm__ and __aarch64__
  are only present in GNU-compatible compilers.  (A minimal detection
  sketch follows this list.)
- Neon/intrinsics: Use the _CountLeadingZeros() and
  _CountLeadingZeros64() intrinsics provided by Visual Studio, since
  __builtin_clz() and __builtin_clzl() are only present in
  GNU-compatible compilers.  (See the BUILTIN_CLZ()/BUILTIN_CLZL()
  sketch after this list.)
- Neon/intrinsics: Since Visual Studio does not support static vector
  initialization, replace static initialization of Neon vectors with the
  appropriate intrinsics.  Compared to the static initialization
  approach, this produces identical assembly code with both GCC and
  Clang.  (One such constant is sketched after this list.)
- Neon/intrinsics: Since Visual Studio does not support inline assembly
  code, provide alternative code paths for Visual Studio whenever inline
  assembly is used.  (The fallback pattern is sketched after this list.)
- Build: Set FLOATTEST appropriately for AArch64 Visual Studio builds
  (Visual Studio does not emit fused multiply-add [FMA] instructions by
  default for such builds).
- Neon/intrinsics: Move temporary buffer allocation outside of nested
  loops.  Since Visual Studio configures Arm builds with a relatively
  small amount of stack memory, attempting to allocate those buffers
  within the inner loops caused a stack overflow.
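
As a minimal sketch of the compile-time detection described in the
first item (illustration only; the real checks are the combined #if
tests throughout the diff below):

  /* Detect an Arm target under either compiler family.  GNU-compatible
   * compilers predefine __arm__ / __aarch64__; Visual Studio predefines
   * _M_ARM / _M_ARM64 instead. */
  #if defined(__arm__) || defined(__aarch64__) || \
      defined(_M_ARM) || defined(_M_ARM64)
  /* Arm-specific code path */
  #endif

  /* 64-bit Arm only: */
  #if defined(__aarch64__) || defined(_M_ARM64)
  /* AArch64-specific code path */
  #endif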
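
The count-leading-zeros abstraction from the second item is added to
simd/arm/neon-compat.h.in (see the diff below).  A sketch of the
resulting macros (for nonzero arguments, the GNU builtins and the
Visual Studio intrinsics return the same leading-zero count):

  #if defined(_MSC_VER) && !defined(__clang__)
  /* Visual Studio Arm intrinsics */
  #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
  #define BUILTIN_CLZL(x)  _CountLeadingZeros64(x)
  #elif defined(__clang__) || defined(__GNUC__)
  #define BUILTIN_CLZ(x)  __builtin_clz(x)
  #define BUILTIN_CLZL(x)  __builtin_clzl(x)
  #else
  #error "Unknown compiler"
  #endif

  /* Typical call site (see simd/arm/aarch32/jchuff-neon.c):
   *   r = BUILTIN_CLZ(bitmap_1_32);
   */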
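
For the third item, a sketch of how one of the replaced Neon constants
is now built (the wrapper function is hypothetical; in the real code
the constant is a local variable in the encoder routines):

  #include <arm_neon.h>

  static inline uint8x8_t bitmap_mask_sketch(void)
  {
    /* GNU-style static initialization, which Visual Studio rejects:
     *   const uint8x8_t bitmap_mask =
     *     { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
     * Intrinsic form used instead: on a little-endian Arm target, lane 0
     * maps to the least significant byte of the 64-bit scalar, so
     * 0x0102040810204080 reproduces the byte sequence above.  GCC and
     * Clang generate identical assembly for both forms. */
    return vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
  }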
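
For the fourth item, a sketch of the fallback pattern (the helper
function is hypothetical; the real instances are the SPLAT() macro in
simd/arm/jchuff.h and the clz computation in
simd/arm/aarch64/jchuff-neon.c, both visible in the diff below):

  #include "neon-compat.h"  /* BUILTIN_CLZ() */

  /* Bits needed to represent a coefficient magnitude (AArch64 sketch) */
  static inline unsigned int coef_nbits(unsigned int coef)
  {
  #if defined(_MSC_VER) && !defined(__clang__)
    /* Visual Studio does not support GNU-style inline assembly. */
    unsigned int lz = BUILTIN_CLZ(coef);
  #else
    unsigned int lz;
    __asm__("clz %w0, %w1" : "=r"(lz) : "r"(coef));
  #endif
    return 32 - lz;
  }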

Closes #461
Closes #475
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b881cc..d057d0f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -892,7 +892,7 @@
   endif()
 else()
   if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND
-    NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    NOT CMAKE_C_COMPILER_ID STREQUAL "Clang" AND NOT MSVC)
     set(DEFAULT_FLOATTEST fp-contract)
   else()
     set(DEFAULT_FLOATTEST no-fp-contract)
diff --git a/ChangeLog.md b/ChangeLog.md
index 0fe2ae5..d422634 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -135,7 +135,9 @@
 for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
 been reverted.
 
-14. The build system can now be used to generate a universal x86-64 + Armv8
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
 libjpeg-turbo SDK package for both iOS and macOS.
 
 
diff --git a/jchuff.c b/jchuff.c
index 2417cac..8ea48b8 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -7,6 +7,7 @@
  * Copyright (C) 2009-2011, 2014-2016, 2018-2020, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -76,7 +77,8 @@
  * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
  * retain the old Huffman encoder behavior when using the GAS implementation.
  */
-#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+                            defined(_M_ARM) || defined(_M_ARM64))
 typedef unsigned long long simd_bit_buf_type;
 #else
 typedef bit_buf_type simd_bit_buf_type;
diff --git a/jdsample.c b/jdsample.c
index 2d34710..eaad72a 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -477,7 +477,8 @@
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || \
+    defined(_M_ARM) || defined(_M_ARM64)
       if (jsimd_can_h1v2_fancy_upsample())
         upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
       else
diff --git a/simd/arm/aarch32/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c
index 96b44d8..362102d 100644
--- a/simd/arm/aarch32/jccolext-neon.c
+++ b/simd/arm/aarch32/jccolext-neon.c
@@ -52,6 +52,8 @@
   JSAMPROW inptr;
   /* Pointers to Y, Cb, and Cr output data */
   JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
 
   /* Set up conversion constants. */
 #ifdef HAVE_VLD1_U16_X2
@@ -79,7 +81,6 @@
        * buffer large enough to accommodate the vector load.
        */
       if (cols_remaining < 8) {
-        ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
         memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
         inptr = tmp_buf;
       }
diff --git a/simd/arm/aarch32/jchuff-neon.c b/simd/arm/aarch32/jchuff-neon.c
index 941c9b2..19d94f7 100644
--- a/simd/arm/aarch32/jchuff-neon.c
+++ b/simd/arm/aarch32/jchuff-neon.c
@@ -31,6 +31,7 @@
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
 #include "../jchuff.h"
+#include "neon-compat.h"
 
 #include <limits.h>
 
@@ -231,8 +232,9 @@
   uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
   uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
 
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
   const uint8x8_t bitmap_mask =
-    { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
 
   row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
   row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
@@ -278,7 +280,7 @@
   const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
 
   while (bitmap_1_32 != 0) {
-    r = __builtin_clz(bitmap_1_32);
+    r = BUILTIN_CLZ(bitmap_1_32);
     i += r;
     bitmap_1_32 <<= r;
     nbits = block_nbits[i];
@@ -299,7 +301,7 @@
   i = 33;
 
   while (bitmap_33_63 != 0) {
-    unsigned int leading_zeros = __builtin_clz(bitmap_33_63);
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
     r += leading_zeros;
     i += leading_zeros;
     bitmap_33_63 <<= leading_zeros;
diff --git a/simd/arm/aarch64/jccolext-neon.c b/simd/arm/aarch64/jccolext-neon.c
index 756aeda..37130c2 100644
--- a/simd/arm/aarch64/jccolext-neon.c
+++ b/simd/arm/aarch64/jccolext-neon.c
@@ -51,6 +51,8 @@
   JSAMPROW inptr;
   /* Pointers to Y, Cb, and Cr output data */
   JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
 
   /* Set up conversion constants. */
   const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
@@ -162,7 +164,6 @@
        * (image_width % 16) columns of data are first memcopied to a temporary
        * buffer large enough to accommodate the vector load.
        */
-      ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
       memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
       inptr = tmp_buf;
 
@@ -255,7 +256,6 @@
        * (image_width % 8) columns of data are first memcopied to a temporary
        * buffer large enough to accommodate the vector load.
        */
-      ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
       memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
       inptr = tmp_buf;
 
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index 808fa95..a0a57a6 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -205,8 +205,9 @@
   uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
                                                vdupq_n_u16(0)));
 
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
   const uint8x8_t bitmap_mask =
-    { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
 
   abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
   abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
@@ -241,8 +242,12 @@
   /* Encode DC coefficient. */
 
   /* Find nbits required to specify sign and amplitude of coefficient. */
+#if defined(_MSC_VER) && !defined(__clang__)
+  unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
+#else
   unsigned int lz;
   __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
+#endif
   unsigned int nbits = 32 - lz;
   /* Emit Huffman-coded symbol and additional diff bits. */
   unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
@@ -326,7 +331,7 @@
     vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
 
     while (bitmap != 0) {
-      r = __builtin_clzl(bitmap);
+      r = BUILTIN_CLZL(bitmap);
       i += r;
       bitmap <<= r;
       nbits = block_nbits[i];
@@ -365,10 +370,10 @@
 
     /* Same as above but must mask diff bits and compute nbits on demand. */
     while (bitmap != 0) {
-      r = __builtin_clzl(bitmap);
+      r = BUILTIN_CLZL(bitmap);
       i += r;
       bitmap <<= r;
-      lz = __builtin_clz(block_abs[i]);
+      lz = BUILTIN_CLZ(block_abs[i]);
       nbits = 32 - lz;
       diff = (unsigned int)(block_diff[i] << lz) >> lz;
       while (r > 15) {
diff --git a/simd/arm/jccolor-neon.c b/simd/arm/jccolor-neon.c
index f18ed9e..9fcc62d 100644
--- a/simd/arm/jccolor-neon.c
+++ b/simd/arm/jccolor-neon.c
@@ -53,7 +53,7 @@
 
 /* Include inline routines for colorspace extensions. */
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -68,7 +68,7 @@
 #define RGB_BLUE  EXT_RGB_BLUE
 #define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -84,7 +84,7 @@
 #define RGB_BLUE  EXT_RGBX_BLUE
 #define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -100,7 +100,7 @@
 #define RGB_BLUE  EXT_BGR_BLUE
 #define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -116,7 +116,7 @@
 #define RGB_BLUE  EXT_BGRX_BLUE
 #define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -132,7 +132,7 @@
 #define RGB_BLUE  EXT_XBGR_BLUE
 #define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
@@ -148,7 +148,7 @@
 #define RGB_BLUE  EXT_XRGB_BLUE
 #define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
 #define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #include "aarch64/jccolext-neon.c"
 #else
 #include "aarch32/jccolext-neon.c"
diff --git a/simd/arm/jcgryext-neon.c b/simd/arm/jcgryext-neon.c
index b1f00e6..416a738 100644
--- a/simd/arm/jcgryext-neon.c
+++ b/simd/arm/jcgryext-neon.c
@@ -41,6 +41,8 @@
 {
   JSAMPROW inptr;
   JSAMPROW outptr;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
@@ -55,7 +57,6 @@
        * buffer large enough to accommodate the vector load.
        */
       if (cols_remaining < 16) {
-        ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
         memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
         inptr = tmp_buf;
       }
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
index 87ff0d3..d30759f 100644
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -17,7 +17,7 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
 #define BIT_BUF_SIZE  64
 #else
 #define BIT_BUF_SIZE  32
@@ -54,7 +54,25 @@
  * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
  * encode 0xFF as 0xFF 0x00.
  */
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 56); \
+  buffer[1] = (JOCTET)(put_buffer >> 48); \
+  buffer[2] = (JOCTET)(put_buffer >> 40); \
+  buffer[3] = (JOCTET)(put_buffer >> 32); \
+  buffer[4] = (JOCTET)(put_buffer >> 24); \
+  buffer[5] = (JOCTET)(put_buffer >> 16); \
+  buffer[6] = (JOCTET)(put_buffer >>  8); \
+  buffer[7] = (JOCTET)(put_buffer      ); \
+}
+#else
+#define SPLAT() { \
+  __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
+  *((uint64_t *)buffer) = put_buffer; \
+}
+#endif
 
 #define FLUSH() { \
   if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
@@ -67,14 +85,27 @@
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
-    *((uint64_t *)buffer) = put_buffer; \
+    SPLAT() \
     buffer += 8; \
   } \
 }
 
 #else
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+}
+#else
+#define SPLAT() { \
+  __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
+  *((uint32_t *)buffer) = put_buffer; \
+}
+#endif
+
 #define FLUSH() { \
   if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
     EMIT_BYTE(put_buffer >> 24) \
@@ -82,8 +113,7 @@
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
-    *((uint32_t *)buffer) = put_buffer; \
+    SPLAT() \
     buffer += 4; \
   } \
 }
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index 61f94c2..8b6d53b 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -27,6 +27,7 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
+#include "neon-compat.h"
 
 #include <arm_neon.h>
 
@@ -212,8 +213,9 @@
   uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
   uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
 
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
-    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 
   row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
   row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
@@ -232,7 +234,7 @@
   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
   /* Store zerobits bitmap. */
@@ -456,8 +458,9 @@
   uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
   uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
 
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
-    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 
   abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
   abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
@@ -476,7 +479,7 @@
   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
   /* Store zerobits bitmap. */
@@ -517,7 +520,7 @@
   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
   /* Store signbits bitmap. */
@@ -560,7 +563,7 @@
   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   /* Move bitmap to a 64-bit scalar register. */
   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 
@@ -569,7 +572,7 @@
     /* EOB position is defined to be 0 if all coefficients != 1. */
     return 0;
   } else {
-    return 63 - __builtin_clzl(bitmap);
+    return 63 - BUILTIN_CLZL(bitmap);
   }
 #else
   /* Move bitmap to two 32-bit scalar registers. */
@@ -580,9 +583,9 @@
   if (bitmap0 == 0 && bitmap1 == 0) {
     return 0;
   } else if (bitmap1 != 0) {
-    return 63 - __builtin_clz(bitmap1);
+    return 63 - BUILTIN_CLZ(bitmap1);
   } else {
-    return 31 - __builtin_clz(bitmap0);
+    return 31 - BUILTIN_CLZ(bitmap0);
   }
 #endif
 }
diff --git a/simd/arm/jcsample-neon.c b/simd/arm/jcsample-neon.c
index e4e7827..8a3e237 100644
--- a/simd/arm/jcsample-neon.c
+++ b/simd/arm/jcsample-neon.c
@@ -84,7 +84,8 @@
   const uint8x16_t expand_mask =
     vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
   /* Load bias pattern (alternating every pixel.) */
-  const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 };
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
   unsigned i, outrow;
 
   for (outrow = 0; outrow < v_samp_factor; outrow++) {
@@ -104,7 +105,7 @@
 
     /* Load pixels in last DCT block into a table. */
     uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     /* Pad the empty elements with the value of the last pixel. */
     pixels = vqtbl1q_u8(pixels, expand_mask);
 #else
@@ -137,7 +138,8 @@
   const uint8x16_t expand_mask =
     vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
   /* Load bias pattern (alternating every pixel.) */
-  const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
   unsigned i, outrow;
 
   for (outrow = 0; outrow < v_samp_factor; outrow++) {
@@ -165,7 +167,7 @@
       vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
     uint8x16_t pixels_r1 =
       vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     /* Pad the empty elements with the value of the last pixel. */
     pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
     pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index 7a03d81..e2347b9 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -21,3 +22,14 @@
 #cmakedefine HAVE_VLD1_S16_X3
 #cmakedefine HAVE_VLD1_U16_X2
 #cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZL(x)  _CountLeadingZeros64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZL(x)  __builtin_clzl(x)
+#else
+#error "Unknown compiler"
+#endif