Fix build errors with MSVC ARM64

(Contributed by Zhijie Liang)
diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
index ea7e356..2ee49cf 100644
--- a/arm/filter_neon_intrinsics.c
+++ b/arm/filter_neon_intrinsics.c
@@ -19,7 +19,11 @@
 /* This code requires -mfpu=neon on the command line: */
 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
 
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
 #include <arm_neon.h>
+#endif
 
 /* libpng row pointers are not necessarily aligned to any particular boundary,
  * however this code will only work with appropriate alignment.  arm/arm_init.c
@@ -33,6 +37,11 @@
  * 'type'.  This is written this way just to hide the GCC strict aliasing
  * warning; note that the code is safe because there never is an alias between
  * the input and output pointers.
+ *
+ * When compiling with MSVC ARM64, the png_ldr macro can't be passed directly
+ * to vst4_lane_u32, because of an internal compiler error inside MSVC.
+ * To avoid this compiler bug, we use a temporary variable (vdest_val) to store
+ * the result of png_ldr.
  */
 #define png_ldr(type,pointer)\
    (temp_pointer = png_ptr(type,pointer), *temp_pointer)
@@ -130,7 +139,9 @@
       vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
       vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
       vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
-      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
+
+      uint32x2x4_t vdest_val = png_ldr(uint32x2x4_t, &vdest);
+      vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
    }
 
    PNG_UNUSED(prev_row)
@@ -240,7 +251,8 @@
       vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
       vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
 
-      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
+      uint32x2x4_t vdest_val = png_ldr(uint32x2x4_t, &vdest);
+      vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
    }
 }
 
@@ -378,7 +390,8 @@
 
       vlast = vpp.val[3];
 
-      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
+      uint32x2x4_t vdest_val = png_ldr(uint32x2x4_t, &vdest);
+      vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
    }
 }