[libpng15] Added support for ARM processor (Mans Rullgard)
diff --git a/ANNOUNCE b/ANNOUNCE
index eb771a0..11520a2 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -27,6 +27,7 @@
 Changes since the last public release (1.5.6):
 
 Version 1.5.7 [November 3, 2011]
+  Added ARM NEON optimized read filter functions (Mans Rullgard)
 
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net:
 (subscription required; visit
diff --git a/CHANGES b/CHANGES
index d3aedb4..305ffea 100644
--- a/CHANGES
+++ b/CHANGES
@@ -3670,6 +3670,7 @@
   No changes.
 
 Version 1.5.7 [November 3, 2011]
+  Added ARM NEON optimized read filter functions (Mans Rullgard)
 
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 (subscription required; visit
diff --git a/arm/filter_neon.S b/arm/filter_neon.S
new file mode 100644
index 0000000..a65ab0c
--- /dev/null
+++ b/arm/filter_neon.S
@@ -0,0 +1,220 @@
+
+/* filter_neon.S - NEON optimised filter functions
+ *
+ * Copyright (c) 2011 Mans Rullgard
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#ifdef __ELF__
+#   define ELF /* prefix expands to nothing: prefixed directives are assembled */
+#else
+#   define ELF @ /* prefix expands to the comment character: directives are hidden */
+#endif
+
+        .arch armv7-a                           @ target architecture for this file
+        .fpu  neon                              @ allow NEON instructions
+
+.macro  func    name, export=0                  @ open a function; nonzero export makes it global
+    .macro endfunc                              @ terminator defined afresh for every function
+ELF     .size   \name, . - \name                @ size = bytes emitted since the label
+        .endfunc
+        .purgem endfunc                         @ delete itself so the next func can define it again
+    .endm
+        .text                                   @ code goes into the text section
+    .if \export
+        .global \name
+    .endif
+ELF     .type   \name, STT_FUNC                 @ mark the symbol as a function
+        .func   \name
+\name:
+.endm
+
+func    png_read_filter_row_sub4_neon, export=1 @ undo Sub filter, 4-byte pixels
+        ldr             r3,  [r0, #4]           @ r3 = rowbytes (r0 = row_info)
+        vmov.i8         d3,  #0                 @ d3 = pixel to the left, 0 at row start
+1:
+        vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128] @ 4 raw pixels, one per d register
+        vadd.u8         d0,  d3,  d4            @ pixel 0 = left + raw 0 (bytewise, wraps)
+        vadd.u8         d1,  d0,  d5            @ pixel 1 = pixel 0 + raw 1
+        vadd.u8         d2,  d1,  d6            @ pixel 2 = pixel 1 + raw 2
+        vadd.u8         d3,  d2,  d7            @ pixel 3; also the left pixel for next pass
+        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! @ write 16 bytes, advance r1
+        subs            r3,  r3,  #16           @ 16 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        bx              lr
+endfunc
+
+func    png_read_filter_row_sub3_neon, export=1 @ undo Sub filter, 3-byte pixels
+        ldr             r3,  [r0, #4]           @ r3 = rowbytes (r0 = row_info)
+        vmov.i8         d3,  #0                 @ d3 = pixel to the left, 0 at row start
+        mov             r0,  r1                 @ r0 = load pointer, r1 = store pointer
+        mov             r2,  #3                 @ store stride: one 3-byte pixel
+        mov             r12, #12                @ load stride: four 3-byte pixels
+        vld1.8          {q11},    [r0], r12     @ q11 (d22:d23) = first 16 bytes of row
+1:
+        vext.8          d5,  d22, d23, #3       @ d5 = raw pixel 1 (bytes 3..5)
+        vadd.u8         d0,  d3,  d22           @ pixel 0 = left + raw 0
+        vext.8          d6,  d22, d23, #6       @ d6 = raw pixel 2 (bytes 6..8)
+        vadd.u8         d1,  d0,  d5            @ pixel 1 = pixel 0 + raw 1
+        vext.8          d7,  d23, d23, #1       @ d7 = raw pixel 3 (bytes 9..11)
+        vld1.8          {q11},    [r0], r12     @ next group is loaded before the stores below
+        vst1.32         {d0[0]},  [r1,:32], r2  @ each store writes 4 bytes, advances 3
+        vadd.u8         d2,  d1,  d6            @ pixel 2 = pixel 1 + raw 2
+        vst1.32         {d1[0]},  [r1], r2
+        vadd.u8         d3,  d2,  d7            @ pixel 3; also the left pixel for next pass
+        vst1.32         {d2[0]},  [r1], r2
+        vst1.32         {d3[0]},  [r1], r2      @ 4th byte spills one byte past this group
+        subs            r3,  r3,  #12           @ 12 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        bx              lr
+endfunc
+
+func    png_read_filter_row_up_neon, export=1   @ undo Up filter, any pixel size
+        ldr             r3,  [r0, #4]           @ r3 = rowbytes (r0 = row_info)
+1:
+        vld1.8          {q0}, [r1,:128]         @ q0 = 16 raw bytes of row (r1)
+        vld1.8          {q1}, [r2,:128]!        @ q1 = 16 bytes of prev_row (r2), advance r2
+        vadd.u8         q0,  q0,  q1            @ bytewise add, wraps modulo 256
+        vst1.8          {q0}, [r1,:128]!        @ write back in place, advance r1
+        subs            r3,  r3,  #16           @ 16 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        bx              lr
+endfunc
+
+func    png_read_filter_row_avg4_neon, export=1 @ undo Average filter, 4-byte pixels
+        ldr             r12, [r0, #4]           @ r12 = rowbytes (r0 = row_info)
+        vmov.i8         d3,  #0                 @ d3 = pixel to the left, 0 at row start
+1:
+        vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128] @ 4 raw pixels from row (r1)
+        vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]! @ 4 pixels above, advance r2
+        vhadd.u8        d0,  d3,  d16           @ (left + above 0) >> 1 per byte
+        vadd.u8         d0,  d0,  d4            @ pixel 0 = average + raw 0
+        vhadd.u8        d1,  d0,  d17           @ pixel 0 is the left neighbour of pixel 1
+        vadd.u8         d1,  d1,  d5            @ pixel 1
+        vhadd.u8        d2,  d1,  d18
+        vadd.u8         d2,  d2,  d6            @ pixel 2
+        vhadd.u8        d3,  d2,  d19
+        vadd.u8         d3,  d3,  d7            @ pixel 3; also the left pixel for next pass
+        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! @ write 16 bytes, advance r1
+        subs            r12, r12, #16           @ 16 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        bx              lr
+endfunc
+
+func    png_read_filter_row_avg3_neon, export=1 @ undo Average filter, 3-byte pixels
+        push            {r4,lr}                 @ r4 and lr are used as scratch below
+        ldr             r12, [r0, #4]           @ r12 = rowbytes (r0 = row_info)
+        vmov.i8         d3,  #0                 @ d3 = pixel to the left, 0 at row start
+        mov             r0,  r1                 @ r0 = load pointer, r1 = store pointer
+        mov             r4,  #3                 @ store stride: one 3-byte pixel
+        mov             lr,  #12                @ load stride: four 3-byte pixels
+        vld1.8          {q11},    [r0], lr      @ q11 (d22:d23) = first 16 bytes of row
+1:
+        vld1.8          {q10},    [r2], lr      @ q10 (d20:d21) = 16 bytes of prev_row
+        vext.8          d5,  d22, d23, #3       @ d5 = raw pixel 1
+        vhadd.u8        d0,  d3,  d20           @ (left + above 0) >> 1 per byte
+        vext.8          d17, d20, d21, #3       @ d17 = above pixel 1
+        vadd.u8         d0,  d0,  d22           @ pixel 0 = average + raw 0
+        vext.8          d6,  d22, d23, #6       @ d6 = raw pixel 2
+        vhadd.u8        d1,  d0,  d17
+        vext.8          d18, d20, d21, #6       @ d18 = above pixel 2
+        vadd.u8         d1,  d1,  d5            @ pixel 1
+        vext.8          d7,  d23, d23, #1       @ d7 = raw pixel 3
+        vld1.8          {q11},    [r0], lr      @ next row group, loaded before the stores
+        vst1.32         {d0[0]},  [r1,:32], r4  @ each store writes 4 bytes, advances 3
+        vhadd.u8        d2,  d1,  d18
+        vst1.32         {d1[0]},  [r1], r4
+        vext.8          d19, d21, d21, #1       @ d19 = above pixel 3
+        vadd.u8         d2,  d2,  d6            @ pixel 2
+        vhadd.u8        d3,  d2,  d19
+        vst1.32         {d2[0]},  [r1], r4
+        vadd.u8         d3,  d3,  d7            @ pixel 3; also the left pixel for next pass
+        vst1.32         {d3[0]},  [r1], r4      @ 4th byte spills one byte past this group
+        subs            r12, r12, #12           @ 12 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        pop             {r4,pc}                 @ restore r4 and return
+endfunc
+
+.macro  paeth           rx,  ra,  rb,  rc       @ rx = Paeth pick of a, b, c; clobbers q12-q15
+        vaddl.u8        q12, \ra, \rb           @ a + b
+        vaddl.u8        q15, \rc, \rc           @ 2*c
+        vabdl.u8        q13, \rb, \rc           @ pa = |b - c|
+        vabdl.u8        q14, \ra, \rc           @ pb = |a - c|
+        vabd.u16        q15, q12, q15           @ pc = |a + b - 2*c|
+        vcle.u16        q12, q13, q14           @ pa <= pb
+        vcle.u16        q13, q13, q15           @ pa <= pc
+        vcle.u16        q14, q14, q15           @ pb <= pc
+        vand            q12, q12, q13           @ pa <= pb && pa <= pc
+        vmovn.u16       d28, q14                @ narrow the pb <= pc mask to bytes
+        vmovn.u16       \rx, q12                @ narrow the pa-is-smallest mask to bytes
+        vbsl            d28, \rb, \rc           @ d28 = (pb <= pc) ? b : c
+        vbsl            \rx, \ra, d28           @ rx = (pa is smallest) ? a : d28
+.endm
+
+func    png_read_filter_row_paeth4_neon, export=1 @ undo Paeth filter, 4-byte pixels
+        ldr             r12, [r0, #4]           @ r12 = rowbytes (r0 = row_info)
+        vmov.i8         d3,  #0                 @ d3 = left pixel (a), 0 at row start
+        vmov.i8         d20, #0                 @ d20 = upper-left pixel (c), 0 at row start
+1:
+        vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128] @ 4 raw pixels from row (r1)
+        vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]! @ 4 pixels above, advance r2
+        paeth           d0,  d3,  d16, d20      @ a = left, b = above 0, c = upper-left
+        vadd.u8         d0,  d0,  d4            @ pixel 0 = predictor + raw 0
+        paeth           d1,  d0,  d17, d16      @ a = pixel 0, b = above 1, c = above 0
+        vadd.u8         d1,  d1,  d5            @ pixel 1
+        paeth           d2,  d1,  d18, d17
+        vadd.u8         d2,  d2,  d6            @ pixel 2
+        paeth           d3,  d2,  d19, d18
+        vmov            d20, d19                @ above 3 is the upper-left of the next group
+        vadd.u8         d3,  d3,  d7            @ pixel 3; also the left pixel for next pass
+        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! @ write 16 bytes, advance r1
+        subs            r12, r12, #16           @ 16 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        bx              lr
+endfunc
+
+func    png_read_filter_row_paeth3_neon, export=1 @ undo Paeth filter, 3-byte pixels
+        push            {r4,lr}                 @ r4 and lr are used as scratch below
+        ldr             r12, [r0, #4]           @ r12 = rowbytes (r0 = row_info)
+        vmov.i8         d3,  #0                 @ d3 = left pixel (a), 0 at row start
+        vmov.i8         d4,  #0                 @ d4 = upper-left pixel (c), 0 at row start
+        mov             r0,  r1                 @ r0 = load pointer, r1 = store pointer
+        mov             r4,  #3                 @ store stride: one 3-byte pixel
+        mov             lr,  #12                @ load stride: four 3-byte pixels
+        vld1.8          {q11},    [r0], lr      @ q11 (d22:d23) = first 16 bytes of row
+1:
+        vld1.8          {q10},    [r2], lr      @ q10 (d20:d21) = 16 bytes of prev_row
+        paeth           d0,  d3,  d20, d4       @ a = left, b = above 0, c = upper-left
+        vext.8          d5,  d22, d23, #3       @ d5 = raw pixel 1
+        vadd.u8         d0,  d0,  d22           @ pixel 0 = predictor + raw 0
+        vext.8          d17, d20, d21, #3       @ d17 = above pixel 1
+        paeth           d1,  d0,  d17, d20      @ a = pixel 0, b = above 1, c = above 0
+        vst1.32         {d0[0]},  [r1,:32], r4  @ each store writes 4 bytes, advances 3
+        vext.8          d6,  d22, d23, #6       @ d6 = raw pixel 2
+        vadd.u8         d1,  d1,  d5            @ pixel 1
+        vext.8          d18, d20, d21, #6       @ d18 = above pixel 2
+        paeth           d2,  d1,  d18, d17
+        vext.8          d7,  d23, d23, #1       @ d7 = raw pixel 3
+        vld1.8          {q11},    [r0], lr      @ next row group, loaded before the last store
+        vst1.32         {d1[0]},  [r1], r4
+        vadd.u8         d2,  d2,  d6            @ pixel 2
+        vext.8          d19, d21, d21, #1       @ d19 = above pixel 3
+        paeth           d3,  d2,  d19, d18
+        vst1.32         {d2[0]},  [r1], r4
+        vmov            d4,  d19                @ above 3 is the upper-left of the next group
+        vadd.u8         d3,  d3,  d7            @ pixel 3; also the left pixel for next pass
+        vst1.32         {d3[0]},  [r1], r4      @ 4th byte spills one byte past this group
+        subs            r12, r12, #12           @ 12 bytes consumed per pass
+        bgt             1b                      @ repeat while bytes remain
+
+        pop             {r4,pc}                 @ restore r4 and return
+endfunc
diff --git a/pngpread.c b/pngpread.c
index 225fbda..56ee864 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -985,7 +985,7 @@
    if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
    {
       if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
-         png_read_filter_row(&row_info, png_ptr->row_buf + 1,
+         png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
             png_ptr->prev_row + 1, png_ptr->row_buf[0]);
       else
          png_error(png_ptr, "bad adaptive filter value");
diff --git a/pngpriv.h b/pngpriv.h
index ca36fe1..d0b7180 100644
--- a/pngpriv.h
+++ b/pngpriv.h
@@ -922,9 +922,24 @@
 /* Unfilter a row: check the filter value before calling this, there is no point
  * calling it for PNG_FILTER_VALUE_NONE.
  */
-PNG_EXTERN void png_read_filter_row PNGARG((png_row_infop row_info,
+PNG_EXTERN void png_read_filter_row PNGARG((png_structp pp, png_row_infop row_info,
     png_bytep row, png_const_bytep prev_row, int filter));
 
+PNG_EXTERN void png_read_filter_row_up_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Up, any pixel size */
+PNG_EXTERN void png_read_filter_row_sub3_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Sub, 3 bytes per pixel */
+PNG_EXTERN void png_read_filter_row_sub4_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Sub, 4 bytes per pixel */
+PNG_EXTERN void png_read_filter_row_avg3_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Average, 3 bytes per pixel */
+PNG_EXTERN void png_read_filter_row_avg4_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Average, 4 bytes per pixel */
+PNG_EXTERN void png_read_filter_row_paeth3_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Paeth, 3 bytes per pixel */
+PNG_EXTERN void png_read_filter_row_paeth4_neon PNGARG((png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row)); /* Paeth, 4 bytes per pixel */
+
 /* Choose the best filter to use and filter the row data */
 PNG_EXTERN void png_write_find_filter PNGARG((png_structp png_ptr,
     png_row_infop row_info));
diff --git a/pngread.c b/pngread.c
index 0f598fa..1b4ae03 100644
--- a/pngread.c
+++ b/pngread.c
@@ -578,7 +578,7 @@
    if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
    {
       if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
-         png_read_filter_row(&row_info, png_ptr->row_buf + 1,
+         png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
             png_ptr->prev_row + 1, png_ptr->row_buf[0]);
       else
          png_error(png_ptr, "bad adaptive filter value");
diff --git a/pngrutil.c b/pngrutil.c
index 7d80d5d..beb947b 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -3498,134 +3498,169 @@
 }
 #endif /* PNG_READ_INTERLACING_SUPPORTED */
 
-/* 1.5.6: Changed to just take a png_row_info (not png_ptr) and to ignore bad
- * adaptive filter bytes.
- */
-void /* PRIVATE */
-png_read_filter_row(png_row_infop row_info, png_bytep row,
-   png_const_bytep prev_row, int filter)
+static void /* Undo the Sub filter: each byte adds the byte bpp to its left */
+png_read_filter_row_sub(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
 {
-   switch (filter)
+   png_size_t i;
+   png_size_t istop = row_info->rowbytes;
+   unsigned int bpp = (row_info->pixel_depth + 7) >> 3; /* bytes per pixel */
+   png_bytep rp = row + bpp; /* byte being reconstructed */
+   png_bytep lp = row; /* reconstructed byte bpp to the left of rp */
+
+   PNG_UNUSED(prev_row)
+   /* The first bpp bytes have no left neighbour and are left unchanged. */
+   for (i = bpp; i < istop; i++)
    {
-      case PNG_FILTER_VALUE_NONE:
-         break;
+      *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); /* sum modulo 256 */
+      rp++;
+   }
+}
 
-      case PNG_FILTER_VALUE_SUB:
-      {
-         png_size_t i;
-         png_size_t istop = row_info->rowbytes;
-         unsigned int bpp = (row_info->pixel_depth + 7) >> 3;
-         png_bytep rp = row + bpp;
-         png_bytep lp = row;
+static void /* Undo the Up filter: add the byte directly above in prev_row */
+png_read_filter_row_up(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_size_t i;
+   png_size_t istop = row_info->rowbytes;
+   png_bytep rp = row; /* current row, rewritten in place */
+   png_const_bytep pp = prev_row; /* byte above rp */
 
-         for (i = bpp; i < istop; i++)
-         {
-            *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-            rp++;
-         }
-         break;
-      }
-      case PNG_FILTER_VALUE_UP:
-      {
-         png_size_t i;
-         png_size_t istop = row_info->rowbytes;
-         png_bytep rp = row;
-         png_const_bytep pp = prev_row;
+   for (i = 0; i < istop; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); /* sum modulo 256 */
+      rp++;
+   }
+}
 
-         for (i = 0; i < istop; i++)
-         {
-            *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-            rp++;
-         }
-         break;
-      }
-      case PNG_FILTER_VALUE_AVG:
-      {
-         png_size_t i;
-         png_bytep rp = row;
-         png_const_bytep pp = prev_row;
-         png_bytep lp = row;
-         unsigned int bpp = (row_info->pixel_depth + 7) >> 3;
-         png_size_t istop = row_info->rowbytes - bpp;
+static void /* Undo the Average filter: add (left + above) / 2, rounded down */
+png_read_filter_row_avg(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_size_t i;
+   png_bytep rp = row; /* byte being reconstructed */
+   png_const_bytep pp = prev_row; /* byte above rp */
+   png_bytep lp = row; /* reconstructed byte bpp to the left of rp */
+   unsigned int bpp = (row_info->pixel_depth + 7) >> 3; /* bytes per pixel */
+   png_size_t istop = row_info->rowbytes - bpp; /* bytes after the first pixel */
 
-         for (i = 0; i < bpp; i++)
-         {
-            *rp = (png_byte)(((int)(*rp) +
-                ((int)(*pp++) / 2 )) & 0xff);
+   for (i = 0; i < bpp; i++) /* first pixel: no left neighbour */
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         ((int)(*pp++) / 2 )) & 0xff); /* predictor is above / 2 */
 
-            rp++;
-         }
+      rp++;
+   }
 
-         for (i = 0; i < istop; i++)
-         {
-            *rp = (png_byte)(((int)(*rp) +
-                (int)(*pp++ + *lp++) / 2 ) & 0xff);
+   for (i = 0; i < istop; i++) /* continues with rp and pp from the loop above */
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         (int)(*pp++ + *lp++) / 2 ) & 0xff); /* predictor is (above + left) / 2 */
 
-            rp++;
-         }
-         break;
-      }
-      case PNG_FILTER_VALUE_PAETH:
-      {
-         png_size_t i;
-         png_bytep rp = row;
-         png_const_bytep pp = prev_row;
-         png_bytep lp = row;
-         png_const_bytep cp = prev_row;
-         unsigned int bpp = (row_info->pixel_depth + 7) >> 3;
-         png_size_t istop=row_info->rowbytes - bpp;
+      rp++;
+   }
+}
 
-         for (i = 0; i < bpp; i++)
-         {
-            *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-            rp++;
-         }
+static void /* Undo the Paeth filter: add whichever of a, b, c is nearest a+b-c */
+png_read_filter_row_paeth(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row)
+{
+   png_size_t i;
+   png_bytep rp = row; /* byte being reconstructed */
+   png_const_bytep pp = prev_row; /* byte above rp */
+   png_bytep lp = row; /* reconstructed byte bpp to the left of rp */
+   png_const_bytep cp = prev_row; /* byte above lp, i.e. upper left of rp */
+   unsigned int bpp = (row_info->pixel_depth + 7) >> 3; /* bytes per pixel */
+   png_size_t istop=row_info->rowbytes - bpp; /* bytes after the first pixel */
 
-         for (i = 0; i < istop; i++)   /* Use leftover rp,pp */
-         {
-            int a, b, c, pa, pb, pc, p;
+   for (i = 0; i < bpp; i++) /* first pixel: only the byte above is added */
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+      rp++;
+   }
 
-            a = *lp++;
-            b = *pp++;
-            c = *cp++;
+   for (i = 0; i < istop; i++)   /* Use leftover rp,pp */
+   {
+      int a, b, c, pa, pb, pc, p;
 
-            p = b - c;
-            pc = a - c;
+      a = *lp++; /* left */
+      b = *pp++; /* above */
+      c = *cp++; /* upper left */
+
+      p = b - c; /* temporaries: p and pc are reused for the distances below */
+      pc = a - c;
 
 #ifdef PNG_USE_ABS
-            pa = abs(p);
-            pb = abs(pc);
-            pc = abs(p + pc);
+      pa = abs(p); /* pa = |b - c| */
+      pb = abs(pc); /* pb = |a - c| */
+      pc = abs(p + pc); /* pc = |a + b - 2c| */
 #else
-            pa = p < 0 ? -p : p;
-            pb = pc < 0 ? -pc : pc;
-            pc = (p + pc) < 0 ? -(p + pc) : p + pc;
+      pa = p < 0 ? -p : p; /* pa = |b - c| */
+      pb = pc < 0 ? -pc : pc; /* pb = |a - c| */
+      pc = (p + pc) < 0 ? -(p + pc) : p + pc; /* pc = |a + b - 2c| */
 #endif
 
-            /*
-               if (pa <= pb && pa <= pc)
-                  p = a;
+      /*
+        if (pa <= pb && pa <= pc)
+           p = a;
 
-               else if (pb <= pc)
-                  p = b;
+        else if (pb <= pc)
+           p = b;
 
-               else
-                  p = c;
-             */
+        else
+           p = c;
+      */
 
-            p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
+      p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c; /* ties: a, then b */
 
-            *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-            rp++;
-         }
-         break;
-      }
-      default:
-         /* NOT REACHED */
-         break;
+      *rp = (png_byte)(((int)(*rp) + p) & 0xff); /* sum modulo 256 */
+      rp++;
    }
 }
 
+#ifdef PNG_ARM_NEON
+static void /* Overwrite read_filter entries that have a NEON implementation */
+png_init_filter_functions_neon(png_structp pp)
+{
+   unsigned int bpp = (pp->pixel_depth + 7) >> 3; /* bytes per pixel */
+
+   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon;
+   /* Sub, Avg, Paeth: only for 3 and 4 byte pixels, else entries are untouched */
+   if (bpp == 3) {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_neon;
+   } else if (bpp == 4) {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_neon;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_neon;
+   }
+}
+#endif
+
+static void /* Fill pp->read_filter with the C filters, then any NEON overrides */
+png_init_filter_functions(png_structp pp)
+{
+   pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub;
+   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up;
+   pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg;
+   pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth;
+   /* Entries are indexed by filter value minus 1. */
+#ifdef PNG_ARM_NEON
+   png_init_filter_functions_neon(pp);
+#endif
+}
+
+void /* PRIVATE */ /* Unfilter row in place through the pp->read_filter table */
+png_read_filter_row(png_structp pp, png_row_infop row_info, png_bytep row,
+   png_const_bytep prev_row, int filter)
+{
+   if (pp->read_filter[0] == NULL) /* table not built yet: fill it on first use */
+      png_init_filter_functions(pp);
+   if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST)
+      pp->read_filter[filter-1](row_info, row, prev_row); /* else: row untouched */
+}
+
 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
 void /* PRIVATE */
 png_read_finish_row(png_structp png_ptr)
diff --git a/pngstruct.h b/pngstruct.h
index 33f2384..07f3a04 100644
--- a/pngstruct.h
+++ b/pngstruct.h
@@ -353,5 +353,8 @@
 
 /* New member added in libpng-1.5.6 */
    png_bytep big_prev_row;
+
+   void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info,
+      png_bytep row, png_const_bytep prev_row); /* indexed by filter value - 1 */
 };
 #endif /* PNGSTRUCT_H */