[libpng17] Port recent libpng-1.6.7beta03, beta04 changes to 1.7.0beta21

(mainly ARMv8 support)
diff --git a/ANNOUNCE b/ANNOUNCE
index dd3484c..004f0e2 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -1,5 +1,5 @@
 
-Libpng 1.7.0beta21 - October 13, 2013
+Libpng 1.7.0beta21 - November 2, 2013
 
 This is not intended to be a public release.  It will be replaced
 within a few weeks by a public version or by another test version.
@@ -398,7 +398,20 @@
   Make autogen.sh work with automake 1.13 as well as 1.14. Do this by always
     removing the 1.14 'compile' script but never checking for it.
 
-Version 1.7.0beta21 [October 13, 2013]
+Version 1.7.0beta21 [November 2, 2013]
+  Added ARMv8 support (James Yu <james.yu at linaro.org>).  Added file
+    arm/filter_neon_intrinsics.c; enable with -mfpu=neon.
+  Revised pngvalid to generate size images with as many filters as it can
+    manage, limited by the number of rows.
+  Cleaned up ARM NEON compilation handling. The tests are now in pngpriv.h
+    and detect the broken GCC compilers.
+  Allow clang derived from older GCC versions to use ARM intrinsics. This
+    causes all clang builds that use -mfpu=neon to use the intrinsics code,
+    not the assembler code.  This has only been tested on iOS 7. It may be
+    necessary to exclude some earlier clang versions but this seems unlikely.
+  Changed NEON implementation selection mechanism. This allows assembler
+    or intrinsics to be turned on at compile time during the build by defining
+    PNG_ARM_NEON_IMPLEMENTATION to the correct value (2 or 1).  This macro
 
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 (subscription required; visit
diff --git a/CHANGES b/CHANGES
index cff8756..14f955e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4687,7 +4687,21 @@
   Make autogen.sh work with automake 1.13 as well as 1.14. Do this by always
     removing the 1.14 'compile' script but never checking for it.
 
-Version 1.7.0beta21 [October 13, 2013]
+Version 1.7.0beta21 [November 2, 2013]
+  Added ARMv8 support (James Yu <james.yu at linaro.org>).  Added file
+    arm/filter_neon_intrinsics.c; enable with -mfpu=neon.
+  Revised pngvalid to generate size images with as many filters as it can
+    manage, limited by the number of rows.
+  Cleaned up ARM NEON compilation handling. The tests are now in pngpriv.h
+    and detect the broken GCC compilers.
+  Allow clang derived from older GCC versions to use ARM intrinsics. This
+    causes all clang builds that use -mfpu=neon to use the intrinsics code,
+    not the assembler code.  This has only been tested on iOS 7. It may be
+    necessary to exclude some earlier clang versions but this seems unlikely.
+  Changed NEON implementation selection mechanism. This allows assembler
+    or intrinsics to be turned on at compile time during the build by defining
+    PNG_ARM_NEON_IMPLEMENTATION to the correct value (2 or 1).  This macro
+    is undefined by default and the build type is selected in pngpriv.h.
 
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 (subscription required; visit
diff --git a/LICENSE b/LICENSE
index 4cc64bf..4038673 100644
--- a/LICENSE
+++ b/LICENSE
@@ -10,7 +10,7 @@
 
 This code is released under the libpng license.
 
-libpng versions 1.2.6, August 15, 2004, through 1.7.0beta21, October 13, 2013, are
+libpng versions 1.2.6, August 15, 2004, through 1.7.0beta21, November 2, 2013, are
 Copyright (c) 2004, 2006-2013 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
@@ -108,4 +108,4 @@
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-October 13, 2013
+November 2, 2013
diff --git a/Makefile.am b/Makefile.am
index 7b3dadb..c7b5916 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -82,7 +82,7 @@
 
 if PNG_ARM_NEON
 libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES += arm/arm_init.c\
-	arm/filter_neon.S
+	arm/filter_neon.S arm/filter_neon_intrinsics.c
 endif
 
 nodist_libpng@PNGLIB_MAJOR@@PNGLIB_MINOR@_la_SOURCES = pnglibconf.h
@@ -326,3 +326,8 @@
 	rm -f '$(DESTDIR)$(libdir)/libpng.sl'
 	rm -f '$(DESTDIR)$(libdir)/libpng.dylib'
 	rm -f '$(DESTDIR)$(libdir)/libpng.dll.a'
+
+# The following addition ensures that 'make all' always builds the test programs
+# too.  It used to, but some change either in libpng or configure stopped this
+# working.
+all-am: $(check_PROGRAMS)
diff --git a/README b/README
index 3b8f30e..f9083fc 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for libpng version 1.7.0beta21 - October 13, 2013 (shared library 17.0)
+README for libpng version 1.7.0beta21 - November 2, 2013 (shared library 17.0)
 See the note about version numbers near the top of png.h
 
 See INSTALL for instructions on how to install libpng.
diff --git a/arm/filter_neon.S b/arm/filter_neon.S
index b8aef10..c5d0b53 100644
--- a/arm/filter_neon.S
+++ b/arm/filter_neon.S
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2013 Glenn Randers-Pehrson
  * Written by Mans Rullgard, 2011.
- * Last changed in libpng 1.5.17 [July 18, 2013]
+ * Last changed in libpng 1.6.7 [(PENDING RELEASE)]
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
@@ -20,6 +20,13 @@
 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
 #endif
 
+/* Assembler NEON support - only works for 32-bit ARM (i.e. it does not work for
+ * ARM64).  The code in arm/filter_neon_intrinsics.c supports ARM64, however it
+ * only works if -mfpu=neon is specified on the GCC command line.  See pngpriv.h
+ * for the logic which sets PNG_USE_ARM_NEON_ASM:
+ */
+#if PNG_ARM_NEON_IMPLEMENTATION == 2 /* hand-coded assembler */
+
 #ifdef PNG_READ_SUPPORTED
 #if PNG_ARM_NEON_OPT > 0
 
@@ -235,3 +242,4 @@
 endfunc
 #endif /* PNG_ARM_NEON_OPT > 0 */
 #endif /* PNG_READ_SUPPORTED */
+#endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 (assembler) */
diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
new file mode 100644
index 0000000..ba39d61
--- /dev/null
+++ b/arm/filter_neon_intrinsics.c
@@ -0,0 +1,372 @@
+
+/* filter_neon_intrinsics.c - NEON optimised filter functions
+ *
+ * Copyright (c) 2013 Glenn Randers-Pehrson
+ * Written by James Yu <james.yu at linaro.org>, October 2013.
+ * Based on filter_neon.S, written by Mans Rullgard, 2011.
+ *
+ * Last changed in libpng 1.6.7 [(PENDING RELEASE)]
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#include "pngpriv.h"
+
+/* This code requires -mfpu=neon on the command line: */
+#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code */
+
+#include <arm_neon.h>
+
+/* libpng row pointers are not necessarily aligned to any particular boundary,
+ * however this code will only work with appropriate alignment.  arm/arm_init.c
+ * checks for this (and will not compile unless it is done), this code uses
+ * variants of png_aligncast to avoid compiler warnings.
+ */
+#define png_ptr(type,pointer) png_aligncast(type *,pointer)
+#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
+
+/* The following relies on a variable 'temp_pointer' being declared with type
+ * 'type'.  This is written this way just to hide the GCC strict aliasing
+ * warning; note that the code is safe because there never is an alias between
+ * the input and output pointers.
+ */
+#define png_ldr(type,pointer)\
+   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
+
+#ifdef PNG_READ_SUPPORTED
+#if PNG_ARM_NEON_OPT > 0
+
+void
+png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+   png_const_bytep pp = prev_row;
+
+   for (; rp < rp_stop; rp += 16, pp += 16)
+   {
+      uint8x16_t qrp, qpp;
+
+      qrp = vld1q_u8(rp);
+      qpp = vld1q_u8(pp);
+      qrp = vaddq_u8(qrp, qpp);
+      vst1q_u8(rp, qrp);
+   }
+}
+
+void
+png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+
+   uint8x16_t vtmp = vld1q_u8(rp);
+   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
+   uint8x8x2_t vrp = *vrpt;
+
+   uint8x8x4_t vdest;
+   vdest.val[3] = vdup_n_u8(0);
+
+   for (; rp < rp_stop;)
+   {
+      uint8x8_t vtmp1, vtmp2;
+      uint32x2_t *temp_pointer;
+
+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
+      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
+      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
+
+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
+      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
+
+      vtmp = vld1q_u8(rp + 12);
+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
+      vrp = *vrpt;
+
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
+      rp += 3;
+   }
+
+   PNG_UNUSED(prev_row)
+}
+
+void
+png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+
+   uint8x8x4_t vdest;
+   vdest.val[3] = vdup_n_u8(0);
+
+   for (; rp < rp_stop; rp += 16)
+   {
+      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
+      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
+      uint8x8x4_t vrp = *vrpt;
+      uint32x2x4_t *temp_pointer;
+
+      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
+      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
+      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
+      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
+      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
+   }
+
+   PNG_UNUSED(prev_row)
+}
+
+void
+png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_const_bytep pp = prev_row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+
+   uint8x16_t vtmp;
+   uint8x8x2_t *vrpt;
+   uint8x8x2_t vrp;
+   uint8x8x4_t vdest;
+   vdest.val[3] = vdup_n_u8(0);
+
+   vtmp = vld1q_u8(rp);
+   vrpt = png_ptr(uint8x8x2_t,&vtmp);
+   vrp = *vrpt;
+
+   for (; rp < rp_stop; pp += 12)
+   {
+      uint8x8_t vtmp1, vtmp2, vtmp3;
+
+      uint8x8x2_t *vppt;
+      uint8x8x2_t vpp;
+
+      uint32x2_t *temp_pointer;
+
+      vtmp = vld1q_u8(pp);
+      vppt = png_ptr(uint8x8x2_t,&vtmp);
+      vpp = *vppt;
+
+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+
+      vtmp = vld1q_u8(rp + 12);
+      vrpt = png_ptr(uint8x8x2_t,&vtmp);
+      vrp = *vrpt;
+
+      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
+
+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
+
+      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
+      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
+
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
+      rp += 3;
+   }
+}
+
+void
+png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+   png_const_bytep pp = prev_row;
+
+   uint8x8x4_t vdest;
+   vdest.val[3] = vdup_n_u8(0);
+
+   for (; rp < rp_stop; rp += 16, pp += 16)
+   {
+      uint32x2x4_t vtmp;
+      uint8x8x4_t *vrpt, *vppt;
+      uint8x8x4_t vrp, vpp;
+      uint32x2x4_t *temp_pointer;
+
+      vtmp = vld4_u32(png_ptr(uint32_t,rp));
+      vrpt = png_ptr(uint8x8x4_t,&vtmp);
+      vrp = *vrpt;
+      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
+      vppt = png_ptr(uint8x8x4_t,&vtmp);
+      vpp = *vppt;
+
+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
+      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
+      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
+      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
+      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
+
+      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
+   }
+}
+
+static uint8x8_t
+paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+   uint8x8_t d, e;
+   uint16x8_t p1, pa, pb, pc;
+
+   p1 = vaddl_u8(a, b); /* a + b */
+   pc = vaddl_u8(c, c); /* c * 2 */
+   pa = vabdl_u8(b, c); /* pa */
+   pb = vabdl_u8(a, c); /* pb */
+   pc = vabdq_u16(p1, pc); /* pc */
+
+   p1 = vcleq_u16(pa, pb); /* pa <= pb */
+   pa = vcleq_u16(pa, pc); /* pa <= pc */
+   pb = vcleq_u16(pb, pc); /* pb <= pc */
+
+   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
+
+   d = vmovn_u16(pb);
+   e = vmovn_u16(p1);
+
+   d = vbsl_u8(d, b, c);
+   e = vbsl_u8(e, a, d);
+
+   return e;
+}
+
+void
+png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_const_bytep pp = prev_row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+
+   uint8x16_t vtmp;
+   uint8x8x2_t *vrpt;
+   uint8x8x2_t vrp;
+   uint8x8_t vlast = vdup_n_u8(0);
+   uint8x8x4_t vdest;
+   vdest.val[3] = vdup_n_u8(0);
+
+   vtmp = vld1q_u8(rp);
+   vrpt = png_ptr(uint8x8x2_t,&vtmp);
+   vrp = *vrpt;
+
+   for (; rp < rp_stop; pp += 12)
+   {
+      uint8x8x2_t *vppt;
+      uint8x8x2_t vpp;
+      uint8x8_t vtmp1, vtmp2, vtmp3;
+      uint32x2_t *temp_pointer;
+
+      vtmp = vld1q_u8(pp);
+      vppt = png_ptr(uint8x8x2_t,&vtmp);
+      vpp = *vppt;
+
+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
+      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
+
+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
+
+      vtmp = vld1q_u8(rp + 12);
+      vrpt = png_ptr(uint8x8x2_t,&vtmp);
+      vrp = *vrpt;
+
+      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
+      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
+
+      vlast = vtmp2;
+
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
+      rp += 3;
+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
+      rp += 3;
+   }
+}
+
+void
+png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_bytep rp = row;
+   png_bytep rp_stop = row + row_info->rowbytes;
+   png_const_bytep pp = prev_row;
+
+   uint8x8_t vlast = vdup_n_u8(0);
+   uint8x8x4_t vdest;
+   vdest.val[3] = vdup_n_u8(0);
+
+   for (; rp < rp_stop; rp += 16, pp += 16)
+   {
+      uint32x2x4_t vtmp;
+      uint8x8x4_t *vrpt, *vppt;
+      uint8x8x4_t vrp, vpp;
+      uint32x2x4_t *temp_pointer;
+
+      vtmp = vld4_u32(png_ptr(uint32_t,rp));
+      vrpt = png_ptr(uint8x8x4_t,&vtmp);
+      vrp = *vrpt;
+      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
+      vppt = png_ptr(uint8x8x4_t,&vtmp);
+      vpp = *vppt;
+
+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
+      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
+      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
+      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
+      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
+
+      vlast = vpp.val[3];
+
+      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
+   }
+}
+
+#endif /* PNG_ARM_NEON_OPT > 0 */
+#endif /* PNG_READ_SUPPORTED */
+#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
diff --git a/contrib/libtests/pngvalid.c b/contrib/libtests/pngvalid.c
index 6eb1fe0..cfd22b2 100644
--- a/contrib/libtests/pngvalid.c
+++ b/contrib/libtests/pngvalid.c
@@ -2963,6 +2963,12 @@
  * height of 16 rows.  The width and height are stored in the FILEID and, being
  * non-zero, indicate a size file.
  *
+ * Because the PNG filter code is typically the largest CPU consumer within
+ * libpng itself there is a tendency to attempt to optimize it.  This results in
+ * special case code which needs to be validated.  To cause this to happen the
+ * 'size' images are made to use each possible filter, in so far as this is
+ * possible for smaller images.
+ *
  * For palette image (colour type 3) multiple transform images are stored with
  * the same bit depth to allow testing of more colour combinations -
  * particularly important for testing the gamma code because libpng uses a
@@ -3678,6 +3684,7 @@
          int npasses = npasses_from_interlace_type(pp, interlace_type);
          png_uint_32 y;
          int pass;
+         int nfilter = PNG_FILTER_VALUE_LAST;
          png_byte image[16][SIZE_ROWMAX];
 
          /* To help consistent error detection make the parts of this buffer
@@ -3731,7 +3738,22 @@
                      continue;
                }
 
-               /* Only get to here if the row has some pixels in it. */
+               /* Only get to here if the row has some pixels in it, set the
+                * filters to 'all' for the very first row and thereafter to a
+                * single filter.  It isn't well documented, but png_set_filter
+                * does accept a filter number (per the spec) as well as a bit
+                * mask.
+                *
+                * The apparent wackiness of decrementing nfilter rather than
+                * incrementing is so that Paeth gets used in all images bigger
+                * than 1 row - it's the tricky one.
+                */
+               png_set_filter(pp, 0/*method*/,
+                  nfilter >= PNG_FILTER_VALUE_LAST ? PNG_ALL_FILTERS : nfilter);
+
+               if (nfilter-- == 0)
+                  nfilter = PNG_FILTER_VALUE_LAST-1;
+
                png_write_row(pp, row);
             }
          }
diff --git a/libpng-manual.txt b/libpng-manual.txt
index 5866c15..f921a2f 100644
--- a/libpng-manual.txt
+++ b/libpng-manual.txt
@@ -1,6 +1,6 @@
 libpng-manual.txt - A description on how to use and modify libpng
 
- libpng version 1.7.0beta21 - October 13, 2013
+ libpng version 1.7.0beta21 - November 2, 2013
  Updated and distributed by Glenn Randers-Pehrson
  <glennrp at users.sourceforge.net>
  Copyright (c) 1998-2013 Glenn Randers-Pehrson
@@ -11,7 +11,7 @@
 
  Based on:
 
- libpng versions 0.97, January 1998, through 1.7.0beta21 - October 13, 2013
+ libpng versions 0.97, January 1998, through 1.7.0beta21 - November 2, 2013
  Updated and distributed by Glenn Randers-Pehrson
  Copyright (c) 1998-2013 Glenn Randers-Pehrson
 
@@ -5001,10 +5001,11 @@
 
 Error detection in some chunks has improved; in particular the iCCP chunk
 reader now does pretty complete validation of the basic format.  Some bad
-profiles that were previously accepted are now rejected, in particular the
-very old broken Microsoft/HP sRGB profile.  The PNG spec requirement that
-only grayscale profiles may appear in images with color type 0 or 4 and that
-even if the image only contains gray pixels, only RGB profiles may appear
+profiles that were previously accepted are now accepted with a warning or
+rejected, depending upon the png_set_benign_errors() setting, in particular the
+very old broken Microsoft/HP 3144-byte sRGB profile.  The PNG spec requirement
+that only grayscale profiles may appear in images with color type 0 or 4 and
+that even if the image only contains gray pixels, only RGB profiles may appear
 in images with color type 2, 3, or 6, is now enforced.  The sRGB chunk
 is allowed to appear in images with any color type.
 
@@ -5013,7 +5014,9 @@
 are allowed by the PNG specification, so these warnings are no longer issued.
 
 The library now issues an error if the application attempts to set a
-transform after it calls png_read_update_info().
+transform after it calls png_read_update_info() or if it attempts to call
+both png_read_update_info() and png_start_read_image() or to call either
+of them more than once.
 
 The default condition for benign_errors is now to treat benign errors as
 warnings while reading and as errors while writing.
@@ -5247,7 +5250,7 @@
 
 XVII. Y2K Compliance in libpng
 
-October 13, 2013
+November 2, 2013
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
diff --git a/libpng.3 b/libpng.3
index adb41d0..6bc6a72 100644
--- a/libpng.3
+++ b/libpng.3
@@ -1,4 +1,4 @@
-.TH LIBPNG 3 "October 13, 2013"
+.TH LIBPNG 3 "November 2, 2013"
 .SH NAME
 libpng \- Portable Network Graphics (PNG) Reference Library 1.7.0beta21
 .SH SYNOPSIS
@@ -494,7 +494,7 @@
 .SH LIBPNG.TXT
 libpng-manual.txt - A description on how to use and modify libpng
 
- libpng version 1.7.0beta21 - October 13, 2013
+ libpng version 1.7.0beta21 - November 2, 2013
  Updated and distributed by Glenn Randers-Pehrson
  <glennrp at users.sourceforge.net>
  Copyright (c) 1998-2013 Glenn Randers-Pehrson
@@ -505,7 +505,7 @@
 
  Based on:
 
- libpng versions 0.97, January 1998, through 1.7.0beta21 - October 13, 2013
+ libpng versions 0.97, January 1998, through 1.7.0beta21 - November 2, 2013
  Updated and distributed by Glenn Randers-Pehrson
  Copyright (c) 1998-2013 Glenn Randers-Pehrson
 
@@ -5496,10 +5496,11 @@
 
 Error detection in some chunks has improved; in particular the iCCP chunk
 reader now does pretty complete validation of the basic format.  Some bad
-profiles that were previously accepted are now rejected, in particular the
-very old broken Microsoft/HP sRGB profile.  The PNG spec requirement that
-only grayscale profiles may appear in images with color type 0 or 4 and that
-even if the image only contains gray pixels, only RGB profiles may appear
+profiles that were previously accepted are now accepted with a warning or
+rejected, depending upon the png_set_benign_errors() setting, in particular the
+very old broken Microsoft/HP 3144-byte sRGB profile.  The PNG spec requirement
+that only grayscale profiles may appear in images with color type 0 or 4 and
+that even if the image only contains gray pixels, only RGB profiles may appear
 in images with color type 2, 3, or 6, is now enforced.  The sRGB chunk
 is allowed to appear in images with any color type.
 
@@ -5508,7 +5509,9 @@
 are allowed by the PNG specification, so these warnings are no longer issued.
 
 The library now issues an error if the application attempts to set a
-transform after it calls png_read_update_info().
+transform after it calls png_read_update_info() or if it attempts to call
+both png_read_update_info() and png_start_read_image() or to call either
+of them more than once.
 
 The default condition for benign_errors is now to treat benign errors as
 warnings while reading and as errors while writing.
@@ -5742,7 +5745,7 @@
 
 .SH XVII. Y2K Compliance in libpng
 
-October 13, 2013
+November 2, 2013
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
@@ -6012,7 +6015,7 @@
 
 Thanks to Frank J. T. Wojcik for helping with the documentation.
 
-Libpng version 1.7.0beta21 - October 13, 2013:
+Libpng version 1.7.0beta21 - November 2, 2013:
 Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc.
 Currently maintained by Glenn Randers-Pehrson (glennrp at users.sourceforge.net).
 
@@ -6035,7 +6038,7 @@
 
 This code is released under the libpng license.
 
-libpng versions 1.2.6, August 15, 2004, through 1.7.0beta21, October 13, 2013, are
+libpng versions 1.2.6, August 15, 2004, through 1.7.0beta21, November 2, 2013, are
 Copyright (c) 2004,2006-2007 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
@@ -6134,7 +6137,7 @@
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-October 13, 2013
+November 2, 2013
 
 .\" end of man page
 
diff --git a/libpngpf.3 b/libpngpf.3
index 529dd92..c7014fc 100644
--- a/libpngpf.3
+++ b/libpngpf.3
@@ -1,4 +1,4 @@
-.TH LIBPNGPF 3 "October 13, 2013"
+.TH LIBPNGPF 3 "November 2, 2013"
 .SH NAME
 libpng \- Portable Network Graphics (PNG) Reference Library 1.7.0beta21
 (private functions)
diff --git a/png.5 b/png.5
index f33fff2..45d2047 100644
--- a/png.5
+++ b/png.5
@@ -1,4 +1,4 @@
-.TH PNG 5 "October 13, 2013"
+.TH PNG 5 "November 2, 2013"
 .SH NAME
 png \- Portable Network Graphics (PNG) format
 .SH DESCRIPTION
diff --git a/png.c b/png.c
index f9710f7..0eabb25 100644
--- a/png.c
+++ b/png.c
@@ -691,13 +691,13 @@
 #else
 #  ifdef __STDC__
    return PNG_STRING_NEWLINE \
-     "libpng version 1.7.0beta21 - October 13, 2013" PNG_STRING_NEWLINE \
+     "libpng version 1.7.0beta21 - November 2, 2013" PNG_STRING_NEWLINE \
      "Copyright (c) 1998-2013 Glenn Randers-Pehrson" PNG_STRING_NEWLINE \
      "Copyright (c) 1996-1997 Andreas Dilger" PNG_STRING_NEWLINE \
      "Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc." \
      PNG_STRING_NEWLINE;
 #  else
-      return "libpng version 1.7.0beta21 - October 13, 2013\
+      return "libpng version 1.7.0beta21 - November 2, 2013\
       Copyright (c) 1998-2013 Glenn Randers-Pehrson\
       Copyright (c) 1996-1997 Andreas Dilger\
       Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.";
diff --git a/png.h b/png.h
index 0178841..19abd60 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.7.0beta21 - October 13, 2013
+ * libpng version 1.7.0beta21 - November 2, 2013
  * Copyright (c) 1998-2013 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
@@ -11,7 +11,7 @@
  * Authors and maintainers:
  *   libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat
  *   libpng versions 0.89c, June 1996, through 0.96, May 1997: Andreas Dilger
- *   libpng versions 0.97, January 1998, through 1.7.0beta21 - October 13, 2013: Glenn
+ *   libpng versions 0.97, January 1998, through 1.7.0beta21 - November 2, 2013: Glenn
  *   See also "Contributing Authors", below.
  *
  * Note about libpng version numbers:
@@ -200,7 +200,7 @@
  *
  * This code is released under the libpng license.
  *
- * libpng versions 1.2.6, August 15, 2004, through 1.7.0beta21, October 13, 2013, are
+ * libpng versions 1.2.6, August 15, 2004, through 1.7.0beta21, November 2, 2013, are
  * Copyright (c) 2004, 2006-2013 Glenn Randers-Pehrson, and are
  * distributed according to the same disclaimer and license as libpng-1.2.5
  * with the following individual added to the list of Contributing Authors:
@@ -312,7 +312,7 @@
  * Y2K compliance in libpng:
  * =========================
  *
- *    October 13, 2013
+ *    November 2, 2013
  *
  *    Since the PNG Development group is an ad-hoc body, we can't make
  *    an official declaration.
@@ -380,7 +380,7 @@
 /* Version information for png.h - this should match the version in png.c */
 #define PNG_LIBPNG_VER_STRING "1.7.0beta21"
 #define PNG_HEADER_VERSION_STRING \
-     " libpng version 1.7.0beta21 - October 13, 2013\n"
+     " libpng version 1.7.0beta21 - November 2, 2013\n"
 
 #define PNG_LIBPNG_VER_SONUM   17
 #define PNG_LIBPNG_VER_DLLNUM  17
diff --git a/pngconf.h b/pngconf.h
index d68cf7e..516de80 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,7 +1,7 @@
 
 /* pngconf.h - machine configurable file for libpng
  *
- * libpng version 1.7.0beta21 - October 13, 2013
+ * libpng version 1.7.0beta21 - November 2, 2013
  *
  * Copyright (c) 1998-2013 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pngpriv.h b/pngpriv.h
index 200dd43..8143dad 100644
--- a/pngpriv.h
+++ b/pngpriv.h
@@ -163,7 +163,49 @@
     * callbacks to do this.
     */
 #  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_neon
-#endif
+
+   /* By default the 'intrinsics' code in arm/filter_neon_intrinsics.c is used
+    * if possible - if __ARM_NEON__ is set and the compiler version is not known
+    * to be broken.  This is control by PNG_ARM_NEON_IMPLEMENTATION which can
+    * be:
+    *
+    *    1  The intrinsics code (the default with __ARM_NEON__)
+    *    2  The hand coded assembler (the default without __ARM_NEON__)
+    *
+    * It is possible to set PNG_ARM_NEON_IMPLEMENTATION in CPPFLAGS, however
+    * this is *NOT* supported and may cease to work even after a minor revision
+    * to libpng.  It *is* valid to do this for testing purposes, e.g. speed
+    * testing or a new compiler, but the results should be communicated to the
+    * libpng implementation list for incorporation in the next minor release.
+    */
+#  ifndef PNG_ARM_NEON_IMPLEMENTATION
+#     ifdef __ARM_NEON__
+#        if defined(__clang__)
+            /* At present it is unknown by the libpng developers which versions
+             * of clang support the intrinsics, however some or perhaps all
+             * versions do not work with the assembler so this may be
+             * irrelevant, so just use the default (do nothing here.)
+             */
+#        elif defined(__GNUC__)
+            /* GCC 4.5.4 NEON support is known to be broken.  4.6.3 is known to
+             * work, so if this *is* GCC, or G++, look for a version >4.5
+             */
+#           if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+#              define PNG_ARM_NEON_IMPLEMENTATION 2
+#           endif /* no GNUC support */
+#        endif /* __GNUC__ */
+#     else /* !defined __ARM_NEON__ */
+         /* The 'intrinsics' code simply won't compile without this -mfpu=neon:
+          */
+#        define PNG_ARM_NEON_IMPLEMENTATION 2
+#     endif /* __ARM_NEON__ */
+#  endif /* !defined PNG_ARM_NEON_IMPLEMENTATION */
+
+#  ifndef PNG_ARM_NEON_IMPLEMENTATION
+      /* Use the intrinsics code by default. */
+#     define PNG_ARM_NEON_IMPLEMENTATION 1
+#  endif
+#endif /* PNG_ARM_NEON_OPT > 0 */
 
 /* Is this a build of a DLL where compilation of the object modules requires
  * different preprocessor settings to those required for a simple library?  If
diff --git a/projects/vstudio/readme.txt b/projects/vstudio/readme.txt
index 742313f..21e544d 100644
--- a/projects/vstudio/readme.txt
+++ b/projects/vstudio/readme.txt
@@ -1,7 +1,7 @@
 
 VisualStudio instructions
 
-libpng version 1.7.0beta21 - October 13, 2013
+libpng version 1.7.0beta21 - November 2, 2013
 
 Copyright (c) 1998-2010 Glenn Randers-Pehrson
 
diff --git a/projects/vstudio/zlib.props b/projects/vstudio/zlib.props
index b56817a..37c2a76 100644
--- a/projects/vstudio/zlib.props
+++ b/projects/vstudio/zlib.props
@@ -2,7 +2,7 @@
 <!--
  * zlib.props - location of zlib source
  *
- * libpng version 1.7.0beta21 - October 13, 2013
+ * libpng version 1.7.0beta21 - November 2, 2013
  *
  * Copyright (c) 1998-2011 Glenn Randers-Pehrson
  *
diff --git a/scripts/README.txt b/scripts/README.txt
index 6adedc2..c99fedc 100644
--- a/scripts/README.txt
+++ b/scripts/README.txt
@@ -1,5 +1,5 @@
 
-Makefiles for  libpng version 1.7.0beta21 - October 13, 2013
+Makefiles for  libpng version 1.7.0beta21 - November 2, 2013
 
 pnglibconf.h.prebuilt       =>  Stores configuration settings
  makefile.linux    =>  Linux/ELF makefile
diff --git a/scripts/pnglibconf.h.prebuilt b/scripts/pnglibconf.h.prebuilt
index d96e6fb..ae373bf 100644
--- a/scripts/pnglibconf.h.prebuilt
+++ b/scripts/pnglibconf.h.prebuilt
@@ -2,7 +2,7 @@
 
 /* pnglibconf.h - library build configuration */
 
-/* Libpng version 1.7.0beta21 - October 13, 2013 */
+/* Libpng version 1.7.0beta21 - November 2, 2013 */
 
 /* Copyright (c) 1998-2013 Glenn Randers-Pehrson */