Neon: Fix Huffman enc. error w/Visual Studio+Clang

The GNU builtin function __builtin_clzl() accepts an unsigned long
argument, which is 8 bytes wide on LP64 systems (most Un*x systems,
including Mac) but 4 bytes wide on LLP64 systems (Windows).  On LLP64
systems, the 64-bit bitmap passed to __builtin_clzl() was therefore
truncated to its lower 32 bits before the leading zeros were counted.
This caused the Neon intrinsics implementation of Huffman encoding to
produce mathematically incorrect results when compiled using Visual
Studio with Clang.

This commit changes all invocations of __builtin_clzl() in the Neon SIMD
extensions to __builtin_clzll(), which accepts an unsigned long long
argument that is guaranteed to be at least 8 bytes wide on all systems.
The BUILTIN_CLZL() wrapper macro is renamed to BUILTIN_CLZLL() to match.

Fixes #480
Closes #490
diff --git a/ChangeLog.md b/ChangeLog.md
index d312f27..92bf27e 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -11,6 +11,10 @@
 decompress a specially-crafted malformed progressive JPEG image caused the
 block smoothing algorithm to read from uninitialized memory.
 
+3. Fixed an issue in the Arm Neon SIMD Huffman encoders that caused the
+encoders to generate incorrect results when using the Clang compiler with
+Visual Studio.
+
 
 2.0.90 (2.1 beta1)
 ==================
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index a0a57a6..f13fd1b 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -1,7 +1,7 @@
 /*
  * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
  *
- * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
  * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
@@ -331,7 +331,7 @@
     vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
 
     while (bitmap != 0) {
-      r = BUILTIN_CLZL(bitmap);
+      r = BUILTIN_CLZLL(bitmap);
       i += r;
       bitmap <<= r;
       nbits = block_nbits[i];
@@ -370,7 +370,7 @@
 
     /* Same as above but must mask diff bits and compute nbits on demand. */
     while (bitmap != 0) {
-      r = BUILTIN_CLZL(bitmap);
+      r = BUILTIN_CLZLL(bitmap);
       i += r;
       bitmap <<= r;
       lz = BUILTIN_CLZ(block_abs[i]);
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index 8b6d53b..86a263f 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -1,7 +1,7 @@
 /*
  * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
  *
- * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -572,7 +572,7 @@
     /* EOB position is defined to be 0 if all coefficients != 1. */
     return 0;
   } else {
-    return 63 - BUILTIN_CLZL(bitmap);
+    return 63 - BUILTIN_CLZLL(bitmap);
   }
 #else
   /* Move bitmap to two 32-bit scalar registers. */
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index e2347b9..23d6d28 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -26,10 +26,10 @@
 /* Define compiler-independent count-leading-zeros macros */
 #if defined(_MSC_VER) && !defined(__clang__)
 #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
-#define BUILTIN_CLZL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
 #elif defined(__clang__) || defined(__GNUC__)
 #define BUILTIN_CLZ(x)  __builtin_clz(x)
-#define BUILTIN_CLZL(x)  __builtin_clzl(x)
+#define BUILTIN_CLZLL(x)  __builtin_clzll(x)
 #else
 #error "Unknown compiler"
 #endif