Imported from libpng-1.2.20rc4.tar
diff --git a/ANNOUNCE b/ANNOUNCE
index 2c81353..0a7187c 100644
--- a/ANNOUNCE
+++ b/ANNOUNCE
@@ -1,5 +1,5 @@
 
-Libpng 1.2.20rc3 - August 30, 2007
+Libpng 1.2.20rc4 - September 1, 2007
 
 This is not intended to be a public release.  It will be replaced
 within a few weeks by a public version or by another test version.
@@ -9,32 +9,32 @@
 Source files with LF line endings (for Unix/Linux) and with a
 "configure" script
 
-   libpng-1.2.20rc3.tar.gz
-   libpng-1.2.20rc3.tar.bz2
+   libpng-1.2.20rc4.tar.gz
+   libpng-1.2.20rc4.tar.bz2
 
 Source files with LF line endings (for Unix/Linux) without the
 "configure" script
 
-   libpng-1.2.20rc3-no-config.tar.gz
-   libpng-1.2.20rc3-no-config.tar.bz2
+   libpng-1.2.20rc4-no-config.tar.gz
+   libpng-1.2.20rc4-no-config.tar.bz2
 
 Source files with CRLF line endings (for Windows), without the
 "configure" script
 
-   lp1220r03.zip
-   lp1220r03.tar.bz2
+   lp1220r04.zip
+   lp1220r04.tar.bz2
 
 Project files
 
-   libpng-1.2.20rc3-project-netware.zip
-   libpng-1.2.20rc3-project-wince.zip
+   libpng-1.2.20rc4-project-netware.zip
+   libpng-1.2.20rc4-project-wince.zip
 
 Other information:
 
-   libpng-1.2.20rc3-README.txt
-   libpng-1.2.20rc3-KNOWNBUGS.txt
-   libpng-1.2.20rc3-LICENSE.txt
-   libpng-1.2.20rc3-Y2K-compliance.txt
+   libpng-1.2.20rc4-README.txt
+   libpng-1.2.20rc4-KNOWNBUGS.txt
+   libpng-1.2.20rc4-LICENSE.txt
+   libpng-1.2.20rc4-Y2K-compliance.txt
 
 Changes since the last public release (1.2.19):
 
@@ -62,13 +62,16 @@
   Revised #ifdefs to ensure one and only one of pnggccrd.c, pngvcrd.c,
     or part of pngrutil.c is selected.
 
-version 1.2.20rc3 [August 30, 2007]
+version 1.2.20rc3 [September 1, 2007]
   Remove a little more code in pngwutil.c when PNG_NO_WRITE_FILTER is selected.
   Added /D _CRT_SECURE_NO_WARNINGS to visual6c and visualc71 projects.
   Compile png_mmx_support() in png.c even when PNG_NO_MMX_CODE is defined.
   Restored a "superfluous" #ifdef that was removed from 1.2.20rc2 pnggccrd.c,
     breaking the png_mmx_support() function.
 
+version 1.2.20rc4 [September 1, 2007]
+  Removed Intel contributions (MMX, Optimized C).
+
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 
 (subscription required; visit 
diff --git a/CHANGES b/CHANGES
index e09eeb3..ffb954e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1926,6 +1926,9 @@
   Restored a "superfluous" #ifdef that was removed from 1.2.20rc2 pnggccrd.c,
     breaking the png_mmx_support() function.
 
+version 1.2.20rc4 [September 1, 2007]
+  Removed Intel contributions (MMX, Optimized C).
+
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net
 (subscription required; visit
 https://lists.sourceforge.net/lists/listinfo/png-mng-implement
diff --git a/INSTALL b/INSTALL
index ed8b513..75f58e8 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,5 +1,5 @@
 
-Installing libpng version 1.2.20rc3 - August 30, 2007
+Installing libpng version 1.2.20rc4 - September 1, 2007
 
 On Unix/Linux and similar systems, you can simply type
 
@@ -44,7 +44,7 @@
 correspond to the version of zlib that's installed.
 
 You can rename the directories that you downloaded (they
-might be called "libpng-1.2.20rc3" or "lpng109" and "zlib-1.2.1"
+might be called "libpng-1.2.20rc4" or "lpng109" and "zlib-1.2.1"
 or "zlib121") so that you have directories called "zlib" and "libpng".
 
 Your directory structure should look like this:
@@ -101,15 +101,9 @@
  CMakeLists.txt    =>  "cmake" script
  makefile.std      =>  Generic UNIX makefile (cc, creates static libpng.a)
  makefile.elf      =>  Linux/ELF makefile symbol versioning,
-                       gcc, creates libpng12.so.0.1.2.20rc3)
+                       gcc, creates libpng12.so.0.1.2.20rc4)
  makefile.linux    =>  Linux/ELF makefile
-                       (gcc, creates libpng12.so.0.1.2.20rc3)
- makefile.gcmmx    =>  Linux/ELF makefile
-                       (gcc, creates libpng12.so.0.1.2.20rc3,
-                       uses assembler code tuned for Intel MMX platform)
- makefile.nommx    =>  Linux/ELF makefile
-                       (gcc, creates libpng12.so.0.1.2.20rc3
-                       does not use Intel MMX assembler code)
+                       (gcc, creates libpng12.so.0.1.2.20rc4)
  makefile.gcc      =>  Generic makefile (gcc, creates static libpng.a)
  makefile.knr      =>  Archaic UNIX Makefile that converts files with
                        ansi2knr (Requires ansi2knr.c from
@@ -131,14 +125,14 @@
  makefile.openbsd  =>  OpenBSD makefile
  makefile.sgi      =>  Silicon Graphics IRIX makefile (cc, creates static lib)
  makefile.sggcc    =>  Silicon Graphics (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.sunos    =>  Sun makefile
  makefile.solaris  =>  Solaris 2.X makefile (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.solaris-x86 =>  Solaris/intelMMX 2.X makefile (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.so9      =>  Solaris 9 makefile (gcc,
-                       creates libpng12.so.0.1.2.20rc3)
+                       creates libpng12.so.0.1.2.20rc4)
  makefile.32sunu   =>  Sun Ultra 32-bit makefile
  makefile.64sunu   =>  Sun Ultra 64-bit makefile
  makefile.sco      =>  For SCO OSr5  ELF and Unixware 7 with Native cc
@@ -154,10 +148,7 @@
  makefile.tc3      =>  Turbo C 3.0 makefile
  makefile.dj2      =>  DJGPP 2 makefile
  makefile.msc      =>  Microsoft C makefile
- makefile.vcawin32 =>  makefile for Microsoft Visual C++ 5.0 and later (uses
-                       assembler code tuned for Intel MMX platform)
- makefile.vcwin32  =>  makefile for Microsoft Visual C++ 4.0 and later (does
-                       not use assembler code)
+ makefile.vcwin32  =>  makefile for Microsoft Visual C++ 4.0 and later
  makefile.os2      =>  OS/2 Makefile (gcc and emx, requires pngos2.def)
  pngos2.def        =>  OS/2 module definition file used by makefile.os2
  makefile.watcom   =>  Watcom 10a+ Makefile, 32-bit flat memory model
diff --git a/KNOWNBUG b/KNOWNBUG
index 0245123..b1a946c 100644
--- a/KNOWNBUG
+++ b/KNOWNBUG
@@ -1,5 +1,5 @@
 
-Known bugs in libpng version 1.2.20rc3
+Known bugs in libpng version 1.2.20rc4
 
 1. April 22, 2001: pnggccrd.c has been reported to crash on NetBSD when
    reading interlaced PNG files, when assembler code is enabled but running
diff --git a/LICENSE b/LICENSE
index 9f9b122..1501152 100644
--- a/LICENSE
+++ b/LICENSE
@@ -8,7 +8,7 @@
 If you modify libpng you may insert additional notices immediately following
 this sentence.
 
-libpng versions 1.2.6, August 15, 2004, through 1.2.20rc3, August 30, 2007, are
+libpng versions 1.2.6, August 15, 2004, through 1.2.20rc4, September 1, 2007, are
 Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
@@ -106,4 +106,4 @@
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-August 30, 2007
+September 1, 2007
diff --git a/Makefile.am b/Makefile.am
index de7dd31..549c2c7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -33,7 +33,7 @@
 EXTRA_LTLIBRARIES= libpng.la
 libpng12_la_SOURCES = png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
 	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-	pngwtran.c pngmem.c pngerror.c pngpread.c pnggccrd.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c \
 	png.h pngconf.h
 libpng_la_SOURCES = $(libpng12_la_SOURCES)
 
@@ -83,7 +83,7 @@
 	${srcdir}/contrib/pngsuite/* \
 	${srcdir}/contrib/visupng/* \
 	$(TESTS) \
-	example.c libpng.txt pngvcrd.c 
+	example.c libpng.txt
 
 CLEANFILES= pngout.png libpng12.pc libpng12-config libpng.vers \
 libpng.sym
diff --git a/Makefile.in b/Makefile.in
index 41913eb..a56378a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -87,8 +87,7 @@
 	libpng_la-pngread.lo libpng_la-pngrio.lo libpng_la-pngwio.lo \
 	libpng_la-pngwrite.lo libpng_la-pngrtran.lo \
 	libpng_la-pngwtran.lo libpng_la-pngmem.lo \
-	libpng_la-pngerror.lo libpng_la-pngpread.lo \
-	libpng_la-pnggccrd.lo
+	libpng_la-pngerror.lo libpng_la-pngpread.lo
 am_libpng_la_OBJECTS = $(am__objects_1)
 libpng_la_OBJECTS = $(am_libpng_la_OBJECTS)
 libpng12_la_LIBADD =
@@ -99,7 +98,7 @@
 	libpng12_la-pngwio.lo libpng12_la-pngwrite.lo \
 	libpng12_la-pngrtran.lo libpng12_la-pngwtran.lo \
 	libpng12_la-pngmem.lo libpng12_la-pngerror.lo \
-	libpng12_la-pngpread.lo libpng12_la-pnggccrd.lo
+	libpng12_la-pngpread.lo
 libpng12_la_OBJECTS = $(am_libpng12_la_OBJECTS)
 am_pngtest_OBJECTS = pngtest.$(OBJEXT)
 pngtest_OBJECTS = $(am_pngtest_OBJECTS)
@@ -184,7 +183,6 @@
 LDFLAGS = @LDFLAGS@
 LIBOBJS = @LIBOBJS@
 LIBPNG_DEFINES = @LIBPNG_DEFINES@
-LIBPNG_NO_MMX = @LIBPNG_NO_MMX@
 LIBS = @LIBS@
 LIBTOOL = @LIBTOOL@
 LN_S = @LN_S@
@@ -286,7 +284,7 @@
 EXTRA_LTLIBRARIES = libpng.la
 libpng12_la_SOURCES = png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
 	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-	pngwtran.c pngmem.c pngerror.c pngpread.c pnggccrd.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c \
 	png.h pngconf.h
 
 libpng_la_SOURCES = $(libpng12_la_SOURCES)
@@ -322,7 +320,7 @@
 	${srcdir}/contrib/pngsuite/* \
 	${srcdir}/contrib/visupng/* \
 	$(TESTS) \
-	example.c libpng.txt pngvcrd.c 
+	example.c libpng.txt
 
 CLEANFILES = pngout.png libpng12.pc libpng12-config libpng.vers \
 libpng.sym
@@ -455,7 +453,6 @@
 
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-png.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngerror.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pnggccrd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngget.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngmem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngpread.Plo@am__quote@
@@ -471,7 +468,6 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng12_la-pngwutil.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-png.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngerror.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pnggccrd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngget.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngmem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngpread.Plo@am__quote@
@@ -613,13 +609,6 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pngpread.lo `test -f 'pngpread.c' || echo '$(srcdir)/'`pngpread.c
 
-libpng_la-pnggccrd.lo: pnggccrd.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-pnggccrd.lo -MD -MP -MF "$(DEPDIR)/libpng_la-pnggccrd.Tpo" -c -o libpng_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpng_la-pnggccrd.Tpo" "$(DEPDIR)/libpng_la-pnggccrd.Plo"; else rm -f "$(DEPDIR)/libpng_la-pnggccrd.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pnggccrd.c' object='libpng_la-pnggccrd.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c
-
 libpng12_la-png.lo: png.c
 @am__fastdepCC_TRUE@	if $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png.lo -MD -MP -MF "$(DEPDIR)/libpng12_la-png.Tpo" -c -o libpng12_la-png.lo `test -f 'png.c' || echo '$(srcdir)/'`png.c; \
 @am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpng12_la-png.Tpo" "$(DEPDIR)/libpng12_la-png.Plo"; else rm -f "$(DEPDIR)/libpng12_la-png.Tpo"; exit 1; fi
@@ -725,13 +714,6 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pngpread.lo `test -f 'pngpread.c' || echo '$(srcdir)/'`pngpread.c
 
-libpng12_la-pnggccrd.lo: pnggccrd.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-pnggccrd.lo -MD -MP -MF "$(DEPDIR)/libpng12_la-pnggccrd.Tpo" -c -o libpng12_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpng12_la-pnggccrd.Tpo" "$(DEPDIR)/libpng12_la-pnggccrd.Plo"; else rm -f "$(DEPDIR)/libpng12_la-pnggccrd.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pnggccrd.c' object='libpng12_la-pnggccrd.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --mode=compile --tag=CC $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pnggccrd.lo `test -f 'pnggccrd.c' || echo '$(srcdir)/'`pnggccrd.c
-
 mostlyclean-libtool:
 	-rm -f *.lo
 
diff --git a/README b/README
index dec2d45..6ae9daf 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README for libpng version 1.2.20rc3 - August 30, 2007 (shared library 12.0)
+README for libpng version 1.2.20rc4 - September 1, 2007 (shared library 12.0)
 See the note about version numbers near the top of png.h
 
 See INSTALL for instructions on how to install libpng.
@@ -190,11 +190,11 @@
        descrip.mms      =>  VMS makefile for MMS or MMK
        makefile.std     =>  Generic UNIX makefile (cc, creates static libpng.a)
        makefile.elf     =>  Linux/ELF makefile symbol versioning,
-                            gcc, creates libpng12.so.0.1.2.20rc3)
+                            gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.linux   =>  Linux/ELF makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.gcmmx   =>  Linux/ELF makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3,
+                            (gcc, creates libpng12.so.0.1.2.20rc4,
                             uses assembler code tuned for Intel MMX platform)
        makefile.gcc     =>  Generic makefile (gcc, creates static libpng.a)
        makefile.knr     =>  Archaic UNIX Makefile that converts files with
@@ -216,12 +216,12 @@
        makefile.openbsd =>  OpenBSD makefile
        makefile.sgi     =>  Silicon Graphics IRIX (cc, creates static lib)
        makefile.sggcc   =>  Silicon Graphics
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.sunos   =>  Sun makefile
        makefile.solaris =>  Solaris 2.X makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.so9     =>  Solaris 9 makefile
-                            (gcc, creates libpng12.so.0.1.2.20rc3)
+                            (gcc, creates libpng12.so.0.1.2.20rc4)
        makefile.32sunu  =>  Sun Ultra 32-bit makefile
        makefile.64sunu  =>  Sun Ultra 64-bit makefile
        makefile.sco     =>  For SCO OSr5  ELF and Unixware 7 with Native cc
diff --git a/Y2KINFO b/Y2KINFO
index af33e0e..52cb27b 100644
--- a/Y2KINFO
+++ b/Y2KINFO
@@ -1,13 +1,13 @@
    Y2K compliance in libpng:
    =========================
 
-      August 30, 2007
+      September 1, 2007
 
       Since the PNG Development group is an ad-hoc body, we can't make
       an official declaration.
 
       This is your unofficial assurance that libpng from version 0.71 and
-      upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+      upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
       versions were also Y2K compliant.
 
       Libpng only has three year fields.  One is a 2-byte unsigned integer
diff --git a/configure b/configure
index c60ff40..bfaf636 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.61 for libpng 1.2.20rc3.
+# Generated by GNU Autoconf 2.61 for libpng 1.2.20rc4.
 #
 # Report bugs to <png-mng-implement@lists.sourceforge.net>.
 #
@@ -728,8 +728,8 @@
 # Identity of this package.
 PACKAGE_NAME='libpng'
 PACKAGE_TARNAME='libpng'
-PACKAGE_VERSION='1.2.20rc3'
-PACKAGE_STRING='libpng 1.2.20rc3'
+PACKAGE_VERSION='1.2.20rc4'
+PACKAGE_STRING='libpng 1.2.20rc4'
 PACKAGE_BUGREPORT='png-mng-implement@lists.sourceforge.net'
 
 ac_unique_file="pngget.c"
@@ -876,7 +876,6 @@
 POW_LIB
 LIBOBJS
 LIBPNG_DEFINES
-LIBPNG_NO_MMX
 HAVE_LD_VERSION_SCRIPT_TRUE
 HAVE_LD_VERSION_SCRIPT_FALSE
 PNGLIB_VERSION
@@ -1405,7 +1404,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures libpng 1.2.20rc3 to adapt to many kinds of systems.
+\`configure' configures libpng 1.2.20rc4 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1475,7 +1474,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of libpng 1.2.20rc3:";;
+     short | recursive ) echo "Configuration of libpng 1.2.20rc4:";;
    esac
   cat <<\_ACEOF
 
@@ -1585,7 +1584,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-libpng configure 1.2.20rc3
+libpng configure 1.2.20rc4
 generated by GNU Autoconf 2.61
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1599,7 +1598,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by libpng $as_me 1.2.20rc3, which was
+It was created by libpng $as_me 1.2.20rc4, which was
 generated by GNU Autoconf 2.61.  Invocation command line was
 
   $ $0 $@
@@ -2269,7 +2268,7 @@
 
 # Define the identity of the package.
  PACKAGE='libpng'
- VERSION='1.2.20rc3'
+ VERSION='1.2.20rc4'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -2440,7 +2439,7 @@
 
 
 
-PNGLIB_VERSION=1.2.20rc3
+PNGLIB_VERSION=1.2.20rc4
 PNGLIB_MAJOR=1
 PNGLIB_MINOR=2
 PNGLIB_RELEASE=20
@@ -4788,7 +4787,7 @@
   ;;
 *-*-irix6*)
   # Find out which ABI we are using.
-  echo '#line 4791 "configure"' > conftest.$ac_ext
+  echo '#line 4790 "configure"' > conftest.$ac_ext
   if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
   (eval $ac_compile) 2>&5
   ac_status=$?
@@ -7301,11 +7300,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7304: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7303: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:7308: \$? = $ac_status" >&5
+   echo "$as_me:7307: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -7591,11 +7590,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7594: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7593: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:7598: \$? = $ac_status" >&5
+   echo "$as_me:7597: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -7695,11 +7694,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7698: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7697: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:7702: \$? = $ac_status" >&5
+   echo "$as_me:7701: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -10044,7 +10043,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10047 "configure"
+#line 10046 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -10144,7 +10143,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10147 "configure"
+#line 10146 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -12564,11 +12563,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12567: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12566: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:12571: \$? = $ac_status" >&5
+   echo "$as_me:12570: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -12668,11 +12667,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12671: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12670: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:12675: \$? = $ac_status" >&5
+   echo "$as_me:12674: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -14230,11 +14229,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14233: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14232: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:14237: \$? = $ac_status" >&5
+   echo "$as_me:14236: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -14334,11 +14333,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14337: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14336: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:14341: \$? = $ac_status" >&5
+   echo "$as_me:14340: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -16521,11 +16520,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16524: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16523: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:16528: \$? = $ac_status" >&5
+   echo "$as_me:16527: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -16811,11 +16810,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16814: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16813: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:16818: \$? = $ac_status" >&5
+   echo "$as_me:16817: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -16915,11 +16914,11 @@
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16918: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16917: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:16922: \$? = $ac_status" >&5
+   echo "$as_me:16921: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -20713,55 +20712,6 @@
 
 
 LIBPNG_DEFINES=-DPNG_CONFIGURE_LIBPNG
-{ echo "$as_me:$LINENO: checking if assembler code in pnggccrd.c can be compiled without PNG_NO_MMX_CODE" >&5
-echo $ECHO_N "checking if assembler code in pnggccrd.c can be compiled without PNG_NO_MMX_CODE... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#include "$srcdir/pnggccrd.c"
-int
-main ()
-{
-return 0;
-  ;
-  return 0;
-}
-_ACEOF
-rm -f conftest.$ac_objext
-if { (ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_compile") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } && {
-	 test -z "$ac_c_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then
-  { echo "$as_me:$LINENO: result: yes" >&5
-echo "${ECHO_T}yes" >&6; }
-  LIBPNG_NO_MMX=""
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	{ echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-  LIBPNG_NO_MMX=-DPNG_NO_MMX_CODE
-fi
-
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-LIBPNG_DEFINES=$LIBPNG_DEFINES\ $LIBPNG_NO_MMX
-
 
 
 { echo "$as_me:$LINENO: checking if libraries can be versioned" >&5
@@ -21282,7 +21232,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by libpng $as_me 1.2.20rc3, which was
+This file was extended by libpng $as_me 1.2.20rc4, which was
 generated by GNU Autoconf 2.61.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -21335,7 +21285,7 @@
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-libpng config.status 1.2.20rc3
+libpng config.status 1.2.20rc4
 configured by $0, generated by GNU Autoconf 2.61,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
@@ -21663,7 +21613,6 @@
 POW_LIB!$POW_LIB$ac_delim
 LIBOBJS!$LIBOBJS$ac_delim
 LIBPNG_DEFINES!$LIBPNG_DEFINES$ac_delim
-LIBPNG_NO_MMX!$LIBPNG_NO_MMX$ac_delim
 HAVE_LD_VERSION_SCRIPT_TRUE!$HAVE_LD_VERSION_SCRIPT_TRUE$ac_delim
 HAVE_LD_VERSION_SCRIPT_FALSE!$HAVE_LD_VERSION_SCRIPT_FALSE$ac_delim
 PNGLIB_VERSION!$PNGLIB_VERSION$ac_delim
@@ -21676,7 +21625,7 @@
 LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 21; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 20; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
diff --git a/configure.ac b/configure.ac
index c7a98e2..244952c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -18,12 +18,12 @@
 
 dnl Version number stuff here:
 
-AC_INIT([libpng], [1.2.20rc3], [png-mng-implement@lists.sourceforge.net])
+AC_INIT([libpng], [1.2.20rc4], [png-mng-implement@lists.sourceforge.net])
 AM_INIT_AUTOMAKE
 dnl stop configure from automagically running automake
 AM_MAINTAINER_MODE
 
-PNGLIB_VERSION=1.2.20rc3
+PNGLIB_VERSION=1.2.20rc4
 PNGLIB_MAJOR=1
 PNGLIB_MINOR=2
 PNGLIB_RELEASE=20
@@ -59,18 +59,7 @@
 AC_CHECK_LIB(z, zlibVersion, , AC_ERROR([zlib not installed]))
 
 LIBPNG_DEFINES=-DPNG_CONFIGURE_LIBPNG
-AC_MSG_CHECKING(
-  [if assembler code in pnggccrd.c can be compiled without PNG_NO_MMX_CODE])
-AC_TRY_COMPILE(
-  [#include "$srcdir/pnggccrd.c"],
-  [return 0;],
-  AC_MSG_RESULT(yes)
-  LIBPNG_NO_MMX="",
-  AC_MSG_RESULT(no)
-  LIBPNG_NO_MMX=-DPNG_NO_MMX_CODE)
-LIBPNG_DEFINES=$LIBPNG_DEFINES\ $LIBPNG_NO_MMX
 AC_SUBST(LIBPNG_DEFINES)
-AC_SUBST(LIBPNG_NO_MMX)
 
 AC_MSG_CHECKING([if libraries can be versioned])
 GLD=`$LD --help < /dev/null 2>/dev/null | grep version-script`
diff --git a/libpng-1.2.20rc3.txt b/libpng-1.2.20rc4.txt
similarity index 93%
rename from libpng-1.2.20rc3.txt
rename to libpng-1.2.20rc4.txt
index ce05124..ba4d9ab 100644
--- a/libpng-1.2.20rc3.txt
+++ b/libpng-1.2.20rc4.txt
@@ -1,6 +1,6 @@
 libpng.txt - A description on how to use and modify libpng
 
- libpng version 1.2.20rc3 - August 30, 2007
+ libpng version 1.2.20rc4 - September 1, 2007
  Updated and distributed by Glenn Randers-Pehrson
  <glennrp at users.sourceforge.net>
  Copyright (c) 1998-2007 Glenn Randers-Pehrson
@@ -86,9 +86,7 @@
 instances of the structures.  Each thread should have its own
 png_struct and png_info instances, and thus its own image.
 Libpng does not protect itself against two threads using the
-same instance of a structure.  Note: thread safety may be defeated
-by use of some of the MMX assembler code in pnggccrd.c, which is only
-compiled when the user defines PNG_THREAD_UNSAFE_OK.
+same instance of a structure.
 
 II. Structures
 
@@ -2355,17 +2353,13 @@
 
 V. Modifying/Customizing libpng:
 
-There are three issues here.  The first is changing how libpng does
+There are two issues here.  The first is changing how libpng does
 standard things like memory allocation, input/output, and error handling.
 The second deals with more complicated things like adding new chunks,
 adding new transformations, and generally changing how libpng works.
 Both of those are compile-time issues; that is, they are generally
 determined at the time the code is written, and there is rarely a need
-to provide the user with a means of changing them.  The third is a
-run-time issue:  choosing between and/or tuning one or more alternate
-versions of computationally intensive routines; specifically, optimized
-assembly-language (and therefore compiler- and platform-dependent)
-versions.
+to provide the user with a means of changing them.
 
 Memory allocation, input/output, and error handling
 
@@ -2726,172 +2720,6 @@
 having level = 0 will be printed.  There aren't any such statements in
 this version of libpng, but if you insert some they will be printed.
 
-VI.  Runtime optimization
-
-A new feature in libpng 1.2.0 is the ability to dynamically switch between
-standard and optimized versions of some routines.  Currently these are
-limited to three computationally intensive tasks when reading PNG files:
-decoding row filters, expanding interlacing, and combining interlaced or
-transparent row data with previous row data.  Currently the optimized
-versions are available only for x86 (Intel, AMD, etc.) platforms with
-MMX support, though this may change in future versions.  (For example,
-the non-MMX assembler optimizations for zlib might become similarly
-runtime-selectable in future releases, in which case libpng could be
-extended to support them.  Alternatively, the compile-time choice of
-floating-point versus integer routines for gamma correction might become
-runtime-selectable.)
-
-Because such optimizations tend to be very platform- and compiler-dependent,
-both in how they are written and in how they perform, the new runtime code
-in libpng has been written to allow programs to query, enable, and disable
-either specific optimizations or all such optimizations.  For example, to
-enable all possible optimizations (bearing in mind that some "optimizations"
-may actually run more slowly in rare cases):
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       png_uint_32 mask, flags;
-
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags | mask);
-    #endif
-
-To enable only optimizations relevant to reading PNGs, use PNG_SELECT_READ
-by itself when calling png_get_asm_flagmask(); similarly for optimizing
-only writing.  To disable all optimizations:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags & ~mask);
-    #endif
-
-To enable or disable only MMX-related features, use png_get_mmx_flagmask()
-in place of png_get_asm_flagmask().  The mmx version takes one additional
-parameter:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       int selection = PNG_SELECT_READ | PNG_SELECT_WRITE;
-       int compilerID;
-
-       mask = png_get_mmx_flagmask(selection, &compilerID);
-    #endif
-
-On return, compilerID will indicate which version of the MMX assembler
-optimizations was compiled.  Currently two flavors exist:  Microsoft
-Visual C++ (compilerID == 1) and GNU C (a.k.a. gcc/gas, compilerID == 2).
-On non-x86 platforms or on systems compiled without MMX optimizations, a
-value of -1 is used.
-
-Note that both png_get_asm_flagmask() and png_get_mmx_flagmask() return
-all valid, settable optimization bits for the version of the library that's
-currently in use.  In the case of shared (dynamically linked) libraries,
-this may include optimizations that did not exist at the time the code was
-written and compiled.  It is also possible, of course, to enable only known,
-specific optimizations; for example:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200) && \
-       defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED)
-       flags = PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-             | PNG_ASM_FLAG_MMX_READ_INTERLACE    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_UP    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-       png_set_asm_flags(png_ptr, flags);
-    #endif
-
-This method would enable only the MMX read-optimizations available at the
-time of libpng 1.2.0's release, regardless of whether a later version of
-the DLL were actually being used.  (Also note that these functions did not
-exist in versions older than 1.2.0, so any attempt to run a dynamically
-linked app on such an older version would fail.)
-
-To determine whether the processor supports MMX instructions at all, use
-the png_mmx_support() function:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       mmxsupport = png_mmx_support();
-    #endif
-
-It returns -1 if MMX support is not compiled into libpng, 0 if MMX code
-is compiled but MMX is not supported by the processor, or 1 if MMX support
-is fully available.  Note that png_mmx_support(), png_get_mmx_flagmask(),
-and png_get_asm_flagmask() all may be called without allocating and ini-
-tializing any PNG structures (for example, as part of a usage screen or
-"about" box).
-
-The following code can be used to prevent an application from using the
-thread_unsafe features, even if libpng was built with PNG_THREAD_UNSAFE_OK
-defined:
-
-#if defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED) \
-  && defined(PNG_THREAD_UNSAFE_OK)
-    /* Disable thread-unsafe features of pnggccrd */
-    if (png_access_version_number() >= 10200)
-    {
-      png_uint_32 mmx_disable_mask = 0;
-      png_uint_32 asm_flags;
-
-      mmx_disable_mask |= ( PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH );
-      asm_flags = png_get_asm_flags(png_ptr);
-      png_set_asm_flags(png_ptr, asm_flags & ~mmx_disable_mask);
-    }
-#endif
-
-For more extensive examples of runtime querying, enabling and disabling
-of optimized features, see contrib/gregbook/readpng2.c in the libpng
-source-code distribution.
-
-It is also possible to disable or enable specific optimization features
-at compile time.  To disable them entirely, which may result in slower
-but smaller and less complex and troublesome code, define
-
-     PNG_NO_ASSEMBLER_CODE
-
-If you do this, then the run-time code for setting and querying flags
-described above, and the assembler code in pnggccrd.c or pngvcrd.c will
-not be built.
-
-If you have disabled the assembler code, you can also disable the
-optimized C code, to obtain even slower but smaller code, by defining
-
-     PNG_NO_OPTIMZED_CODE
-
-To disable only the MMX assembler code, define
-
-     PNG_NO_MMX_CODE
-
-There are two versions of the MMX code: one for gcc compilers and one for
-MSVC compilers.  Pngconf.h should automatically detect which you are
-using, and it will set either PNG_USE_PNGGCCRD or PNG_USE_PNGVCRD for you.
-
-Define one or more of the following to disable specific parts of the
-assembler code:
-
-     PNG_NO_MMX_COMBINE_ROW
-     PNG_NO_MMX_READ_INTERLACE
-     PNG_NO_MMX_READ_FILTER_ROW
-
-If the latter is not disabled, you can disable one or more of the
-individual filter types by defining
-
-     PNG_NO_MMX_FILTER_SUB
-     PNG_NO_MMX_FILTER_UP
-     PNG_NO_MMX_FILTER_AVG
-     PNG_NO_MMX_FILTER_PAETH
-
-By default, libpng only enables the "sub" filter when gcc-3.x is used
-because experiments show that gcc-3.x produces bad code for the others.
-
-When you disable various MMX features, libpng will still be able to decode
-files with those features.  It will fall back upon the optimized C code
-or the unoptimized C code to decode them.
-
-
 VII.  MNG support
 
 The MNG specification (available at http://www.libpng.org/pub/mng) allows
@@ -2968,13 +2796,13 @@
 
 IX. Y2K Compliance in libpng
 
-August 30, 2007
+September 1, 2007
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
 
 This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
 versions were also Y2K compliant.
 
 Libpng only has three year fields.  One is a 2-byte unsigned integer that
diff --git a/libpng.3 b/libpng.3
index d707341..d80f90b 100644
--- a/libpng.3
+++ b/libpng.3
@@ -1,6 +1,6 @@
-.TH LIBPNG 3 "August 30, 2007"
+.TH LIBPNG 3 "September 1, 2007"
 .SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc3
+libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc4
 .SH SYNOPSIS
 \fB
 #include <png.h>\fP
@@ -410,7 +410,7 @@
 .SH LIBPNG.TXT
 libpng.txt - A description on how to use and modify libpng
 
- libpng version 1.2.20rc3 - August 30, 2007
+ libpng version 1.2.20rc4 - September 1, 2007
  Updated and distributed by Glenn Randers-Pehrson
  <glennrp at users.sourceforge.net>
  Copyright (c) 1998-2007 Glenn Randers-Pehrson
@@ -496,9 +496,7 @@
 instances of the structures.  Each thread should have its own
 png_struct and png_info instances, and thus its own image.
 Libpng does not protect itself against two threads using the
-same instance of a structure.  Note: thread safety may be defeated
-by use of some of the MMX assembler code in pnggccrd.c, which is only
-compiled when the user defines PNG_THREAD_UNSAFE_OK.
+same instance of a structure.
 
 .SH II. Structures
 
@@ -2765,17 +2763,13 @@
 
 .SH V. Modifying/Customizing libpng:
 
-There are three issues here.  The first is changing how libpng does
+There are two issues here.  The first is changing how libpng does
 standard things like memory allocation, input/output, and error handling.
 The second deals with more complicated things like adding new chunks,
 adding new transformations, and generally changing how libpng works.
 Both of those are compile-time issues; that is, they are generally
 determined at the time the code is written, and there is rarely a need
-to provide the user with a means of changing them.  The third is a
-run-time issue:  choosing between and/or tuning one or more alternate
-versions of computationally intensive routines; specifically, optimized
-assembly-language (and therefore compiler- and platform-dependent)
-versions.
+to provide the user with a means of changing them.
 
 Memory allocation, input/output, and error handling
 
@@ -3136,172 +3130,6 @@
 having level = 0 will be printed.  There aren't any such statements in
 this version of libpng, but if you insert some they will be printed.
 
-.SH VI.  Runtime optimization
-
-A new feature in libpng 1.2.0 is the ability to dynamically switch between
-standard and optimized versions of some routines.  Currently these are
-limited to three computationally intensive tasks when reading PNG files:
-decoding row filters, expanding interlacing, and combining interlaced or
-transparent row data with previous row data.  Currently the optimized
-versions are available only for x86 (Intel, AMD, etc.) platforms with
-MMX support, though this may change in future versions.  (For example,
-the non-MMX assembler optimizations for zlib might become similarly
-runtime-selectable in future releases, in which case libpng could be
-extended to support them.  Alternatively, the compile-time choice of
-floating-point versus integer routines for gamma correction might become
-runtime-selectable.)
-
-Because such optimizations tend to be very platform- and compiler-dependent,
-both in how they are written and in how they perform, the new runtime code
-in libpng has been written to allow programs to query, enable, and disable
-either specific optimizations or all such optimizations.  For example, to
-enable all possible optimizations (bearing in mind that some "optimizations"
-may actually run more slowly in rare cases):
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       png_uint_32 mask, flags;
-
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags | mask);
-    #endif
-
-To enable only optimizations relevant to reading PNGs, use PNG_SELECT_READ
-by itself when calling png_get_asm_flagmask(); similarly for optimizing
-only writing.  To disable all optimizations:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       flags = png_get_asm_flags(png_ptr);
-       mask = png_get_asm_flagmask(PNG_SELECT_READ | PNG_SELECT_WRITE);
-       png_set_asm_flags(png_ptr, flags & ~mask);
-    #endif
-
-To enable or disable only MMX-related features, use png_get_mmx_flagmask()
-in place of png_get_asm_flagmask().  The mmx version takes one additional
-parameter:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       int selection = PNG_SELECT_READ | PNG_SELECT_WRITE;
-       int compilerID;
-
-       mask = png_get_mmx_flagmask(selection, &compilerID);
-    #endif
-
-On return, compilerID will indicate which version of the MMX assembler
-optimizations was compiled.  Currently two flavors exist:  Microsoft
-Visual C++ (compilerID == 1) and GNU C (a.k.a. gcc/gas, compilerID == 2).
-On non-x86 platforms or on systems compiled without MMX optimizations, a
-value of -1 is used.
-
-Note that both png_get_asm_flagmask() and png_get_mmx_flagmask() return
-all valid, settable optimization bits for the version of the library that's
-currently in use.  In the case of shared (dynamically linked) libraries,
-this may include optimizations that did not exist at the time the code was
-written and compiled.  It is also possible, of course, to enable only known,
-specific optimizations; for example:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200) && \
-       defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED)
-       flags = PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-             | PNG_ASM_FLAG_MMX_READ_INTERLACE    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_UP    \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-             | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-       png_set_asm_flags(png_ptr, flags);
-    #endif
-
-This method would enable only the MMX read-optimizations available at the
-time of libpng 1.2.0's release, regardless of whether a later version of
-the DLL were actually being used.  (Also note that these functions did not
-exist in versions older than 1.2.0, so any attempt to run a dynamically
-linked app on such an older version would fail.)
-
-To determine whether the processor supports MMX instructions at all, use
-the png_mmx_support() function:
-
-    #if defined(PNG_LIBPNG_VER) && (PNG_LIBPNG_VER >= 10200)
-       mmxsupport = png_mmx_support();
-    #endif
-
-It returns -1 if MMX support is not compiled into libpng, 0 if MMX code
-is compiled but MMX is not supported by the processor, or 1 if MMX support
-is fully available.  Note that png_mmx_support(), png_get_mmx_flagmask(),
-and png_get_asm_flagmask() all may be called without allocating and ini-
-tializing any PNG structures (for example, as part of a usage screen or
-"about" box).
-
-The following code can be used to prevent an application from using the
-thread_unsafe features, even if libpng was built with PNG_THREAD_UNSAFE_OK
-defined:
-
-#if defined(PNG_USE_PNGGCCRD) && defined(PNG_ASSEMBLER_CODE_SUPPORTED) \
-  && defined(PNG_THREAD_UNSAFE_OK)
-    /* Disable thread-unsafe features of pnggccrd */
-    if (png_access_version_number() >= 10200)
-    {
-      png_uint_32 mmx_disable_mask = 0;
-      png_uint_32 asm_flags;
-
-      mmx_disable_mask |= ( PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_SUB   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_AVG   \
-                          | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH );
-      asm_flags = png_get_asm_flags(png_ptr);
-      png_set_asm_flags(png_ptr, asm_flags & ~mmx_disable_mask);
-    }
-#endif
-
-For more extensive examples of runtime querying, enabling and disabling
-of optimized features, see contrib/gregbook/readpng2.c in the libpng
-source-code distribution.
-
-It is also possible to disable or enable specific optimization features
-at compile time.  To disable them entirely, which may result in slower
-but smaller and less complex and troublesome code, define
-
-     PNG_NO_ASSEMBLER_CODE
-
-If you do this, then the run-time code for setting and querying flags
-described above, and the assembler code in pnggccrd.c or pngvcrd.c will
-not be built.
-
-If you have disabled the assembler code, you can also disable the
-optimized C code, to obtain even slower but smaller code, by defining
-
-     PNG_NO_OPTIMZED_CODE
-
-To disable only the MMX assembler code, define
-
-     PNG_NO_MMX_CODE
-
-There are two versions of the MMX code: one for gcc compilers and one for
-MSVC compilers.  Pngconf.h should automatically detect which you are
-using, and it will set either PNG_USE_PNGGCCRD or PNG_USE_PNGVCRD for you.
-
-Define one or more of the following to disable specific parts of the
-assembler code:
-
-     PNG_NO_MMX_COMBINE_ROW
-     PNG_NO_MMX_READ_INTERLACE
-     PNG_NO_MMX_READ_FILTER_ROW
-
-If the latter is not disabled, you can disable one or more of the
-individual filter types by defining
-
-     PNG_NO_MMX_FILTER_SUB
-     PNG_NO_MMX_FILTER_UP
-     PNG_NO_MMX_FILTER_AVG
-     PNG_NO_MMX_FILTER_PAETH
-
-By default, libpng only enables the "sub" filter when gcc-3.x is used
-because experiments show that gcc-3.x produces bad code for the others.
-
-When you disable various MMX features, libpng will still be able to decode
-files with those features.  It will fall back upon the optimized C code
-or the unoptimized C code to decode them.
-
-
 .SH VII.  MNG support
 
 The MNG specification (available at http://www.libpng.org/pub/mng) allows
@@ -3378,13 +3206,13 @@
 
 .SH IX. Y2K Compliance in libpng
 
-August 30, 2007
+September 1, 2007
 
 Since the PNG Development group is an ad-hoc body, we can't make
 an official declaration.
 
 This is your unofficial assurance that libpng from version 0.71 and
-upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
 versions were also Y2K compliant.
 
 Libpng only has three year fields.  One is a 2-byte unsigned integer that
@@ -3579,8 +3407,8 @@
  1.0.27              10    10027  10.so.0.27[.0]
  1.2.19              13    10219  12.so.0.19[.0]
  1.2.20beta01-04     13    10220  12.so.0.20[.0]
- 1.0.28rc1-2         10    10028  10.so.0.28[.0]
- 1.2.20rc1-2         13    10220  12.so.0.20[.0]
+ 1.0.28rc1-4         10    10028  10.so.0.28[.0]
+ 1.2.20rc1-4         13    10220  12.so.0.20[.0]
  1.0.28              10    10028  10.so.0.28[.0]
  1.2.20              13    10220  12.so.0.20[.0]
 
@@ -3638,7 +3466,7 @@
 
 Thanks to Frank J. T. Wojcik for helping with the documentation.
 
-Libpng version 1.2.20rc3 - August 30, 2007:
+Libpng version 1.2.20rc4 - September 1, 2007:
 Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc.
 Currently maintained by Glenn Randers-Pehrson (glennrp at users.sourceforge.net).
 
@@ -3659,7 +3487,7 @@
 If you modify libpng you may insert additional notices immediately following
 this sentence.
 
-libpng versions 1.2.6, August 15, 2004, through 1.2.20rc3, August 30, 2007, are
+libpng versions 1.2.6, August 15, 2004, through 1.2.20rc4, September 1, 2007, are
 Copyright (c) 2004,2006-2007 Glenn Randers-Pehrson, and are
 distributed according to the same disclaimer and license as libpng-1.2.5
 with the following individual added to the list of Contributing Authors
@@ -3758,7 +3586,7 @@
 
 Glenn Randers-Pehrson
 glennrp at users.sourceforge.net
-August 30, 2007
+September 1, 2007
 
 .\" end of man page
 
diff --git a/libpngpf.3 b/libpngpf.3
index 1aea5a8..06fddfa 100644
--- a/libpngpf.3
+++ b/libpngpf.3
@@ -1,6 +1,6 @@
-.TH LIBPNGPF 3 "August 30, 2007"
+.TH LIBPNGPF 3 "September 1, 2007"
 .SH NAME
-libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc3
+libpng \- Portable Network Graphics (PNG) Reference Library 1.2.20rc4
 (private functions)
 .SH SYNOPSIS
 \fB#include <png.h>\fP
diff --git a/png.5 b/png.5
index fadb858..317b7ae 100644
--- a/png.5
+++ b/png.5
@@ -1,4 +1,4 @@
-.TH PNG 5 "August 30, 2007"
+.TH PNG 5 "September 1, 2007"
 .SH NAME
 png \- Portable Network Graphics (PNG) format
 .SH DESCRIPTION
diff --git a/png.c b/png.c
index 54d6f7a..e6ffcd4 100644
--- a/png.c
+++ b/png.c
@@ -13,7 +13,7 @@
 #include "png.h"
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_20rc3 Your_png_h_is_not_version_1_2_20rc3;
+typedef version_1_2_20rc4 Your_png_h_is_not_version_1_2_20rc4;
 
 /* Version information for C files.  This had better match the version
  * string defined in png.h.  */
@@ -67,11 +67,6 @@
 /* offset to next interlace block in the y direction */
 PNG_CONST int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
 
-/* width of interlace block (used in assembler routines only) */
-#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-PNG_CONST int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
-#endif
-
 /* Height of interlace block.  This is not currently used - if you need
  * it, uncomment it here and in png.h
 PNG_CONST int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
@@ -698,7 +693,7 @@
 png_get_copyright(png_structp png_ptr)
 {
    png_ptr = png_ptr;  /* silence compiler warning about unused png_ptr */
-   return ((png_charp) "\n libpng version 1.2.20rc3 - August 30, 2007\n\
+   return ((png_charp) "\n libpng version 1.2.20rc4 - September 1, 2007\n\
    Copyright (c) 1998-2007 Glenn Randers-Pehrson\n\
    Copyright (c) 1996-1997 Andreas Dilger\n\
    Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.\n");
@@ -825,57 +820,18 @@
 void /* PRIVATE */
 png_init_mmx_flags (png_structp png_ptr)
 {
-    if(png_ptr == NULL) return;
-    png_ptr->mmx_rowbytes_threshold = 0;
-    png_ptr->mmx_bitdepth_threshold = 0;
-
-#  if (defined(PNG_USE_PNGVCRD) || defined(PNG_USE_PNGGCCRD))
-
-    png_ptr->asm_flags |= PNG_ASM_FLAG_MMX_SUPPORT_COMPILED;
-
-    if (png_mmx_support() > 0) {
-        png_ptr->asm_flags |= PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU
-#    ifdef PNG_HAVE_MMX_COMBINE_ROW
-                              | PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
-#    endif
-#    ifdef PNG_HAVE_MMX_READ_INTERLACE
-                              | PNG_ASM_FLAG_MMX_READ_INTERLACE
-#    endif
-#    ifndef PNG_HAVE_MMX_READ_FILTER_ROW
-                              ;
-#    else
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_SUB
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_UP
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_AVG
-                              | PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-
-        png_ptr->mmx_rowbytes_threshold = PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT;
-        png_ptr->mmx_bitdepth_threshold = PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT;
-#    endif
-    } else {
-        png_ptr->asm_flags &= ~( PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU
-                               | PNG_MMX_READ_FLAGS
-                               | PNG_MMX_WRITE_FLAGS );
-    }
-
-#  else /* !(PNGVCRD || PNGGCCRD) */
-
-    /* clear all MMX flags; no support is compiled in */
-    png_ptr->asm_flags &= ~( PNG_MMX_FLAGS );
-
-#  endif /* ?(PNGVCRD || PNGGCCRD) */
+   /* obsolete, to be removed from libpng-1.4.0 */
 }
 
 #endif /* !(PNG_MMX_CODE_SUPPORTED) */
 
 /* this function was added to libpng 1.2.0 */
-#if !defined(PNG_USE_PNGGCCRD) && !defined(PNG_USE_PNGVCRD)
 int PNGAPI
 png_mmx_support(void)
 {
+   /* obsolete, to be removed from libpng-1.4.0 */
     return -1;
 }
-#endif
 #endif /* PNG_1_0_X */
 #endif /* PNG_READ_SUPPORTED && PNG_ASSEMBLER_CODE_SUPPORTED */
 
diff --git a/png.h b/png.h
index a26a725..a4aa8d0 100644
--- a/png.h
+++ b/png.h
@@ -1,7 +1,7 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.2.20rc3 - August 30, 2007
+ * libpng version 1.2.20rc4 - September 1, 2007
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
  * (Version 0.88 Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.)
@@ -9,7 +9,7 @@
  * Authors and maintainers:
  *  libpng versions 0.71, May 1995, through 0.88, January 1996: Guy Schalnat
  *  libpng versions 0.89c, June 1996, through 0.96, May 1997: Andreas Dilger
- *  libpng versions 0.97, January 1998, through 1.2.20rc3 - August 30, 2007: Glenn
+ *  libpng versions 0.97, January 1998, through 1.2.20rc4 - September 1, 2007: Glenn
  *  See also "Contributing Authors", below.
  *
  * Note about libpng version numbers:
@@ -156,8 +156,8 @@
  *    1.0.27                  10    10027  10.so.0.27[.0]
  *    1.2.19                  13    10219  12.so.0.19[.0]
  *    1.2.20beta01-04         13    10220  12.so.0.20[.0]
- *    1.0.28rc1-2             10    10028  10.so.0.28[.0]
- *    1.2.20rc1-2             13    10220  12.so.0.20[.0]
+ *    1.0.28rc1-4             10    10028  10.so.0.28[.0]
+ *    1.2.20rc1-4             13    10220  12.so.0.20[.0]
  *    1.0.28                  10    10028  10.so.0.28[.0]
  *    1.2.20                  13    10220  12.so.0.20[.0]
  *
@@ -189,7 +189,7 @@
  * If you modify libpng you may insert additional notices immediately following
  * this sentence.
  *
- * libpng versions 1.2.6, August 15, 2004, through 1.2.20rc3, August 30, 2007, are
+ * libpng versions 1.2.6, August 15, 2004, through 1.2.20rc4, September 1, 2007, are
  * Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson, and are
  * distributed according to the same disclaimer and license as libpng-1.2.5
  * with the following individual added to the list of Contributing Authors:
@@ -301,13 +301,13 @@
  * Y2K compliance in libpng:
  * =========================
  *
- *    August 30, 2007
+ *    September 1, 2007
  *
  *    Since the PNG Development group is an ad-hoc body, we can't make
  *    an official declaration.
  *
  *    This is your unofficial assurance that libpng from version 0.71 and
- *    upward through 1.2.20rc3 are Y2K compliant.  It is my belief that earlier
+ *    upward through 1.2.20rc4 are Y2K compliant.  It is my belief that earlier
  *    versions were also Y2K compliant.
  *
  *    Libpng only has three year fields.  One is a 2-byte unsigned integer
@@ -363,9 +363,9 @@
  */
 
 /* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.2.20rc3"
+#define PNG_LIBPNG_VER_STRING "1.2.20rc4"
 #define PNG_HEADER_VERSION_STRING \
-   " libpng version 1.2.20rc3 - August 30, 2007\n"
+   " libpng version 1.2.20rc4 - September 1, 2007\n"
 
 #define PNG_LIBPNG_VER_SONUM   0
 #define PNG_LIBPNG_VER_DLLNUM  13
@@ -377,7 +377,7 @@
 /* This should match the numeric part of the final component of
  * PNG_LIBPNG_VER_STRING, omitting any leading zero: */
 
-#define PNG_LIBPNG_VER_BUILD  3
+#define PNG_LIBPNG_VER_BUILD  4
 
 /* Release Status */
 #define PNG_LIBPNG_BUILD_ALPHA    1
@@ -504,9 +504,6 @@
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_yinc[7];
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_mask[7];
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_dsp_mask[7];
-#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_width[7];
-#endif
 /* This isn't currently used.  If you need it, see png.c for more details.
 PNG_EXPORT_VAR (PNG_CONST int FARDATA) png_pass_height[7];
 */
@@ -1279,8 +1276,7 @@
      png_size_t current_text_left;   /* how much text left to read in input */
      png_charp current_text;         /* current text chunk buffer */
      png_charp current_text_ptr;     /* current location in current_text */
-#  endif /* PNG_PROGRESSIVE_READ_SUPPORTED && PNG_TEXT_SUPPORTED */
-
+#  endif /* PNG_TEXT_SUPPORTED */
 #endif /* PNG_PROGRESSIVE_READ_SUPPORTED */
 
 #if defined(__TURBOC__) && !defined(_Windows) && !defined(__FLAT__)
@@ -1416,7 +1412,7 @@
 /* This triggers a compiler error in png.c, if png.c and png.h
  * do not agree upon the version number.
  */
-typedef png_structp version_1_2_20rc3;
+typedef png_structp version_1_2_20rc4;
 
 typedef png_struct FAR * FAR * png_structpp;
 
diff --git a/pngconf.h b/pngconf.h
index d5ef5e3..01d7578 100644
--- a/pngconf.h
+++ b/pngconf.h
@@ -1,7 +1,7 @@
 
 /* pngconf.h - machine configurable file for libpng
  *
- * libpng version 1.2.20rc3 - August 30, 2007
+ * libpng version 1.2.20rc4 - September 1, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -600,10 +600,10 @@
 #endif /* PNG_READ_TRANSFORMS_SUPPORTED */
 
 #if !defined(PNG_NO_PROGRESSIVE_READ) && \
- !defined(PNG_PROGRESSIVE_READ_NOT_SUPPORTED) /* if you don't do progressive */
-#  define PNG_PROGRESSIVE_READ_SUPPORTED    /* reading.  This is not talking */
-#endif                              /* about interlacing capability!  You'll */
-             /* still have interlacing unless you change the following line: */
+ !defined(PNG_PROGRESSIVE_READ_SUPPORTED) /* if you don't do progressive   */
+#  define PNG_PROGRESSIVE_READ_SUPPORTED  /* reading.  This is not talking */
+#endif                            /* about interlacing capability!  You'll */
+           /* still have interlacing unless you change the following line: */
 
 #define PNG_READ_INTERLACING_SUPPORTED /* required in PNG-compliant decoders */
 
@@ -727,15 +727,8 @@
 #endif
 
 /* PNG_ASSEMBLER_CODE was enabled by default in version 1.2.0 
- * even when PNG_USE_PNGVCRD or PNG_USE_PNGGCCRD is not defined.
- *
- * PNG_NO_ASSEMBLER_CODE disables use of all assembler code,
- * and removes several functions from the API.
- *
- * PNG_NO_MMX_CODE disables the use of MMX code without changing the API.
- * When MMX code is off, then optimized C replacement functions are used,
- * if PNG_NO_OPTIMIZED_CODE is not enabled.  This was added in version
- * 1.2.19.
+ * and removed from version 1.2.20.  The following will be removed
+ * from libpng-1.4.0
 */
 
 #if defined(PNG_READ_SUPPORTED) && !defined(PNG_NO_OPTIMIZED_CODE)
@@ -762,6 +755,12 @@
 #    endif
 #  endif
 
+#  if (defined(__MWERKS__) && ((__MWERKS__ < 0x0900) || macintosh))
+#    if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
+#      define PNG_NO_MMX_CODE
+#    endif
+#  endif
+
 #  if !defined(PNG_MMX_CODE_SUPPORTED) && !defined(PNG_NO_MMX_CODE)
 #    define PNG_MMX_CODE_SUPPORTED
 #  endif
@@ -774,14 +773,11 @@
 #  if !defined(PNG_USE_PNGGCCRD) && defined(PNG_MMX_CODE_SUPPORTED) && \
      !defined(PNG_USE_PNGVCRD)
 #    define PNG_USE_PNGGCCRD
-     /* If you are sure that you don't need thread safety and you are compiling
-        with PNG_USE_PNGCCRD for an MMX application, you can define this for
-        faster execution.  See pnggccrd.c.
 #    define PNG_THREAD_UNSAFE_OK
-     */
 #  endif
 
 #endif
+/* end of obsolete code to be removed from libpng-1.4.0 */
 
 #if !defined(PNG_1_0_X)
 #if !defined(PNG_NO_USER_MEM) && !defined(PNG_USER_MEM_SUPPORTED)
@@ -1489,53 +1485,6 @@
 #  define PNG_ZBUF_SIZE 65536L
 #endif
 
-#ifdef PNG_READ_SUPPORTED
-/* Prior to libpng-1.0.9, this block was in pngasmrd.h */
-#if defined(PNG_INTERNAL)
-
-#if defined(PNG_USE_PNGGCCRD) || defined(PNG_USE_PNGVCRD)
-  /* Platform must be Pentium.  Makefile must assemble and load
-   * pnggccrd.c or  pngvcrd.c. MMX will be detected at run time and
-   * used if present.
-   */
-#  ifndef PNG_NO_MMX_COMBINE_ROW
-#    define PNG_HAVE_MMX_COMBINE_ROW
-#  endif
-#  ifndef PNG_NO_MMX_READ_INTERLACE
-#    define PNG_HAVE_MMX_READ_INTERLACE
-#  endif
-#  ifndef PNG_NO_MMX_READ_FILTER_ROW
-#    define PNG_HAVE_MMX_READ_FILTER_ROW
-#    ifndef PNG_NO_MMX_FILTER_SUB
-#      define PNG_MMX_READ_FILTER_SUB_SUPPORTED
-#    endif
-#    ifndef PNG_NO_MMX_FILTER_UP
-#      define PNG_MMX_READ_FILTER_UP_SUPPORTED
-#    endif
-#    ifndef PNG_NO_MMX_FILTER_AVG
-#      define PNG_MMX_READ_FILTER_AVG_SUPPORTED
-#    endif
-#    ifndef PNG_NO_MMX_FILTER_PAETH
-#      define PNG_MMX_READ_FILTER_PAETH_SUPPORTED
-#    endif
-#  endif
-  /* These are the default thresholds before the MMX code kicks in; if either
-   * rowbytes or bitdepth is below the threshold, plain C code is used.  These
-   * can be overridden at runtime via the png_set_mmx_thresholds() call in
-   * libpng 1.2.0 and later.  The values below were chosen by Intel.
-   */
-#  ifndef PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT
-#    define PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT  128  /*  >=  */
-#  endif
-#  ifndef PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT
-#    define PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT  9    /*  >=  */   
-#  endif
-#endif /* PNG_USE_PNGGCCRD || PNG_USE_PNGVCRD */
-/* - see pngvcrd.c or pnggccrd.c for info about what is currently enabled */
-
-#endif /* PNG_INTERNAL */
-#endif /* PNG_READ_SUPPORTED */
-
 /* Added at libpng-1.2.8 */
 #endif /* PNG_VERSION_INFO_ONLY */
 
diff --git a/pngerror.c b/pngerror.c
index 93ce47b..2d03c56 100644
--- a/pngerror.c
+++ b/pngerror.c
@@ -1,7 +1,7 @@
 
 /* pngerror.c - stub functions for i/o and memory allocation
  *
- * Last changed in libpng 1.2.20 August 30, 2007
+ * Last changed in libpng 1.2.20 September 1, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
diff --git a/pnggccrd.c b/pnggccrd.c
index 37f3f33..7007ad9 100644
--- a/pnggccrd.c
+++ b/pnggccrd.c
@@ -1,6042 +1 @@
-
-/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
- *
- * For Intel/AMD x86 or x86-64 CPU (Pentium-MMX or later) and GNU C compiler.
- *
- * Last changed in libpng 1.2.19 August 19, 2007
- * For conditions of distribution and use, see copyright notice in png.h
- * Copyright (c) 1998 Intel Corporation
- * Copyright (c) 1999-2002,2007 Greg Roelofs
- * Copyright (c) 1998-2007 Glenn Randers-Pehrson
- *
- * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
- * Interface to libpng contributed by Gilles Vollant, 1999.
- * GNU C port by Greg Roelofs, 1999-2001.
- *
- * References:
- *
- *     http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
- *     http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
- *       [Intel's performance analysis of the MMX vs. non-MMX code;
- *        moved/deleted as of 2006, but text and some graphs still
- *        available via WayBack Machine at archive.org]
- *
- *     http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
- *     http://sam.zoy.org/blog/2007-04-13-shlib-with-non-pic-code-have-inline-assembly-and-pic-mix-well
- *     http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
- *     http://gcc.gnu.org/onlinedocs/gcc/Variable-Attributes.html
- *     http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
- *     AMD64 Architecture Programmer's Manual, volumes 1 and 5
- *       [http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739_7044,00.html]
- *     Intel 64 and IA-32 Software Developer's Manuals
- *       [http://developer.intel.com/products/processor/manuals/]
- *
- * png_read_filter_row_mmx_*() were converted in place with intel2gas 1.3.1:
- *
- *     intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
- *
- * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
- *
- * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
- * is required to assemble the newer asm instructions such as movq.  (Version
- * 2.5.2l.15 is definitely too old.)  See ftp://ftp.gnu.org/pub/gnu/binutils/ .
- */
-
-/*
- * PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
- * ===========================
- *
- * 19991006:
- *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
- *
- * 19991007:
- *  - additional optimizations (possible or definite):
- *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
- *     - write MMX code for 48-bit case (pixel_bytes == 6)
- *     - figure out what's up with 24-bit case (pixel_bytes == 3):
- *        why subtract 8 from width_mmx in the pass 4/5 case?
- *        (only width_mmx case) (near line 2335)
- *     x [DONE] replace pixel_bytes within each block with the true
- *        constant value (or are compilers smart enough to do that?)
- *     - rewrite all MMX interlacing code so it's aligned with
- *        the *beginning* of the row buffer, not the end.  This
- *        would not only allow one to eliminate half of the memory
- *        writes for odd passes (that is, pass == odd), it may also
- *        eliminate some unaligned-data-access exceptions (assuming
- *        there's a penalty for not aligning 64-bit accesses on
- *        64-bit boundaries).  The only catch is that the "leftover"
- *        pixel(s) at the end of the row would have to be saved,
- *        but there are enough unused MMX registers in every case,
- *        so this is not a problem.  A further benefit is that the
- *        post-MMX cleanup code (C code) in at least some of the
- *        cases could be done within the assembler block.
- *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
- *     inconsistent, and don't match the MMX Programmer's Reference
- *     Manual conventions anyway.  They should be changed to
- *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
- *     was lowest in memory (i.e., corresponding to a left pixel)
- *     and b7 is the byte that was highest (i.e., a right pixel).
- *
- * 19991016:
- *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
- *     want globals prefixed by underscores when referencing them--
- *     i.e., if the variable is const4, then refer to it as const4,
- *     not _const4.  This seems to be a djgpp-specific requirement.
- *     Also, such variables apparently *must* be declared outside
- *     of functions; neither static nor automatic variables work if
- *     defined within the scope of a single function, but both
- *     static and truly global (multi-module) variables work fine.
- *
- * 19991017:
- *  - replaced pixel_bytes in each png_memcpy() call with constant value for
- *     inlining (png_do_read_interlace() "non-MMX/modified C code" block)
- *
- * 19991023:
- *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
- *  - switched from string-concatenation-with-macros to cleaner method of
- *     renaming global variables for djgpp--i.e., always use prefixes in
- *     inlined assembler code (== strings) and conditionally rename the
- *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
- *
- * 19991024:
- *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
- *     This one was severely weird:  even though mmxsupport() doesn't touch
- *     ebx (where "row" pointer was stored), it nevertheless managed to zero
- *     the register (even in static/non-fPIC code--see below), which in turn
- *     caused png_do_read_interlace() to return prematurely on the first row of
- *     interlaced images (i.e., without expanding the interlaced pixels).
- *     Inspection of the generated assembly code didn't turn up any clues,
- *     although it did point at a minor optimization (i.e., get rid of
- *     mmx_supported_local variable and just use eax).  Possibly the CPUID
- *     instruction is more destructive than it looks?  (Not yet checked.)
- *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
- *     listings...  Apparently register spillage has to do with ebx, since
- *     it's used to index the global offset table.  Commenting it out of the
- *     input-reg lists in png_combine_row() eliminated compiler barfage, so
- *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
- *
- * 19991107:
- *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
- *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
- *
- * 19991120:
- *  - made "diff" variable (now "_dif") global to simplify conversion of
- *     filtering routines (running out of regs, sigh).  "diff" is still used
- *     in interlacing routines, however.
- *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
- *     macro determines which is used); original not yet tested.
- *
- * 20000213:
- *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
- *
- * 20000319:
- *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
- *     pass == 4 or 5, that caused visible corruption of interlaced images
- *
- * 20000623:
- *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
- *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
- *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
- *     Chuck Wilson supplied a patch involving dummy output registers.  See
- *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
- *     for the original (anonymous) SourceForge bug report.
- *
- * 20000706:
- *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
- *       pnggccrd.c: In function `png_combine_row':
- *       pnggccrd.c:525: more than 10 operands in `asm'
- *       pnggccrd.c:669: more than 10 operands in `asm'
- *       pnggccrd.c:828: more than 10 operands in `asm'
- *       pnggccrd.c:994: more than 10 operands in `asm'
- *       pnggccrd.c:1177: more than 10 operands in `asm'
- *     They are all the same problem and can be worked around by using the
- *     global _unmask variable unconditionally, not just in the -fPIC case.
- *     Reportedly earlier versions of gcc also have the problem with more than
- *     10 operands; they just don't report it.  Much strangeness ensues, etc.
- *
- * 20000729:
- *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
- *     MMX routine); began converting png_read_filter_row_mmx_sub()
- *  - to finish remaining sections:
- *     - clean up indentation and comments
- *     - preload local variables
- *     - add output and input regs (order of former determines numerical
- *        mapping of latter)
- *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
- *     - remove "$" from addressing of Shift and Mask variables [20000823]
- *
- * 20000731:
- *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
- *
- * 20000822:
- *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
- *     shared-library (-fPIC) version!  Code works just fine as part of static
- *     library.  Should have tested that sooner.
- *     ebx is getting clobbered again (explicitly this time); need to save it
- *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
- *
- * 20000823:
- *  - first section was trickiest; all remaining sections have ebx -> edx now.
- *     (-fPIC works again.)  Also added missing underscores to various Shift*
- *     and *Mask* globals and got rid of leading "$" signs.
- *
- * 20000826:
- *  - added visual separators to help navigate microscopic printed copies
- *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
- *     on png_read_filter_row_mmx_avg()
- *
- * 20000828:
- *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
- *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
- *     cleaned up/shortened in either routine, but functionality is complete
- *     and seems to be working fine.
- *
- * 20000829:
- *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
- *     as an input reg (with dummy output variables, etc.), then it *cannot*
- *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
- *     is simple enough...
- *
- * 20000914:
- *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
- *     correctly (but 48-bit RGB just fine)
- *
- * 20000916:
- *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
- *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
- *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
- *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
- *
- * 20010101:
- *  - added new png_init_mmx_flags() function (here only because it needs to
- *     call mmxsupport(), which should probably become global png_mmxsupport());
- *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
- *
- * 20010103:
- *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
- *     and made it public; moved png_init_mmx_flags() to png.c as internal func
- *
- * 20010104:
- *  - removed dependency on png_read_filter_row_c() (C code already duplicated
- *     within MMX version of png_read_filter_row()) so no longer necessary to
- *     compile it into pngrutil.o
- *
- * 20010310:
- *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
- *
- * 20010808:
- *  - added PNG_THREAD_UNSAFE_OK around code using global variables [GR-P]
- *
- * 20011124:
- *  - fixed missing save of Eflag in png_mmx_support() [Maxim Sobolev]
- *
- * 20020304:
- *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
- *
- * 20020407:
- *  - fixed insufficient preservation of ebx register [Sami Farin]
- *
- * 20040724:
- *  - more tinkering with clobber list at lines 4529 and 5033 to get it to
- *     compile with gcc 3.4 [GR-P]
- *
- * 20040809:
- *  - added "rim" definitions for CONST4 and CONST6 [GR-P]
- *
- * 20060303:
- *  - added "OS2" to list of systems that don't need leading underscores [GR-P]
- *
- * 20060320:
- *  - made PIC-compliant [Christian Aichinger]
- *
- * 20070313:
- *  - finally applied Giuseppe Ghibò's 64-bit patch of 20060803 (completely
- *     overlooked Dylan Alex Simon's similar patch of 20060414, oops...)
- *
- * 20070524:
- *  - fixed link failure caused by asm-only variables being optimized out
- *     (identified by Dimitri of Trolltech) with __attribute__((used)), which
- *     also gets rid of warnings => nuked ugly png_squelch_warnings() hack
- *  - dropped redundant ifdef
- *  - moved png_mmx_support() back up where originally intended (as in
- *     pngvcrd.c), using __attribute__((noinline)) in extra prototype
- *
- * 20070527:
- *  - revised png_combine_row() to reuse mask in lieu of external _unmask
- *  - moved 32-bit (RGBA) case to top of png_combine_row():  most common
- *  - just about ready to give up on x86-64 -fPIC mode; can't even access 16
- *     _mask*_* constants without triggering link error on shared library:
- *       /usr/bin/ld: pnggccrd.pic.o: relocation R_X86_64_32S against `a local
- *         symbol' can not be used when making a shared object; recompile with
- *         -fPIC
- *       pnggccrd.pic.o: could not read symbols: Bad value
- *       ("objdump -x pnggccrd.pic.o | grep rodata" to verify)
- *     [might be able to work around by doing within assembly code whatever
- *     -fPIC does, but given problems to date, seems like long shot...]
- *     [relevant ifdefs:  __x86_64__ && __PIC__ => C code only]
- *  - changed #if 0 to #ifdef PNG_CLOBBER_MMX_REGS_SUPPORTED in case gcc ever
- *     supports MMX regs (%mm0, etc.) in clobber list (not supported by gcc
- *     2.7.2.3, 2.91.66 (egcs 1.1.2), 3.x, or 4.1.2)
- *
- * 20070603:
- *  - revised png_combine_row() to use @GOTPCREL(%%rip) addressing on _c64
- *     struct of _mask*_* constants for x86-64 -fPIC; see sam.zoy.org link
- *     above for details
- *  - moved _const4 and _const6 into _c64 struct, renamed to _amask5_3_0 and
- *     _amask7_1_0, respectively
- *  - can't figure out how to use _c64._mask*_* vars within asm code, so still
- *     need single variables for non-x86-64/-fPIC half :-(
- *  - replaced various __PIC__ ifdefs with *_GOT_ebx macros
- *  - moved _LBCarryMask and _HBClearMask into _c64 struct
- *  - conditionally replaced _p*temp variables with %r11d-%r13d (via p*_TEMP
- *     and CLOBBER_r1*d macros)
- *
- * 20070604:
- *  - replaced all _ActiveMask and _ActiveMaskEnd with new _amask*_*_* consts
- *     (_amask naming convention:  numbers of 00-bytes, ff-bytes, 00-bytes)
- *    - _ActiveMask     // (10) // avg/paeth/sub; read-only; consts; movq/pand
- *       0x0000000000ffffffLL (bpp 3, avg)      _amask5_3_0
- *       0xffffffffffffffffLL (bpp 4, 6, avg)   _amask0_8_0
- *       0x000000000000ffffLL (bpp 2, avg)      _amask6_2_0
- *       0x0000000000ffffffLL (bpp 3, paeth)    _amask5_3_0
- *       0x00000000ffffffffLL (bpp 6, paeth)    _amask4_4_0
- *       0x00000000ffffffffLL (bpp 4, paeth)    _amask4_4_0
- *       0x00000000ffffffffLL (bpp 8, paeth)    _amask4_4_0
- *       0x0000ffffff000000LL (bpp 3, sub)      _amask2_3_3
- *       0x00000000ffff0000LL (bpp 2, sub)      _amask4_2_2
- *    - _ActiveMaskEnd  // (1)  // paeth only; read-only; const; pand
- *       0xffff000000000000LL (bpp 3, paeth)    _amask0_2_6
- *  - changed all "#if defined(__x86_64__) // later // && defined(__PIC__)"
- *     lines to "#ifdef PNG_x86_64_USE_GOTPCREL" for easier/safer testing
- *
- * 20070605:
- *  - merged PNG_x86_64_USE_GOTPCREL, non-PNG_x86_64_USE_GOTPCREL code via
- *     *MASK* and LOAD/RESTORE macros
- *
- * 20070607:
- *  - replaced all constant instances of _ShiftBpp, _ShiftRem with immediates
- *     (still have two shared cases in avg, sub routines)
- *
- * 20070609:
- *  - replaced remaining instances of _ShiftBpp, _ShiftRem with immediates
- *     (split sub and avg 4/6-bpp cases into separate blocks)
- *  - fixed paeth bug due to clobbered r11/r12/r13 regs
- *
- * 20070610:
- *  - made global "_dif" variable (avg/paeth/sub routines) local again (now
- *     "diff"--see 19991120 entry above), using register constraints
- *  - note that %ebp in clobber list doesn't actually work, at least for 32-bit
- *     version and gcc 4.1.2; must save and restore manually.  (Seems to work
- *     OK for 64-bit version and gcc 3.4.3, but gcc may not be using ebp/rbp
- *     in that case.)
- *  - started replacing direct _MMXLength accesses with register constraints
- *
- * 20070612:
- *  - continued replacing direct _MMXLength accesses with register constraints
- *
- * 20070613:
- *  - finished replacing direct _MMXLength accesses with register constraints;
- *     switched to local variable (and renamed back to MMXLength)
- *
- * 20070614:
- *  - fixed sub bpp = 1 bug
- *  - started replacing direct _FullLength accesses with register constraints
- *
- * 20070615:
- *  - fixed 64-bit paeth bpp 3 crash bug (misplaced LOAD_GOT_rbp)
- *  - fixed 64-bit paeth bpp 1/2 and cleanup-block crash bugs (misplaced
- *     RESTORE_r11_r12_r13)
- *  - slightly optimized avg/paeth cleanup blocks and paeth bpp 1/2 block
- *     (save/restore ebx only if needed)
- *  - continued replacing direct _FullLength accesses with register constraints
- *
- * 20070616:
- *  - finished replacing direct _FullLength accesses with register constraints
- *     (*ugly* conditional clobber-separator macros for avg and paeth, sigh)
- *
- * 20070618:
- *  - fixed misplaced PNG_THREAD_UNSAFE_OK endif (was missing LOAD_GOT_rbp/
- *     RESTORE_rbp in 32-bit thread-safe case)
- *  - changed all "ifdef *" to "if defined(*)" [GR-P]
- *
- * 20070619:
- *  - rearranged most bitdepth-related case statements to put most frequent
- *     cases at top (24-bit, 32-bit, 8-bit, rest)
- *
- * 20070623:
- *  - cleaned up png_debug() warnings/formatting
- *  - removed PNG_MMX_CODE_SUPPORTED ifdefs and added outer __GNUC__ ifdef
- *     (module no longer used by non-x86/non-GCC builds as of libpng 1.2.19)
- *  - removed single libpng-1.2.x PNG_DEBUG dependency on 1.0.x png_struct
- *     member (row_buf_size)
- *  - rearranged pass-related if-blocks in png_do_read_interlace() to put most
- *     frequent cases (4, 5) at top [GR-P suggestion]
- *
- * 20070624-29:
- *  - fixed 64-bit crash bug:  pointers -> rsi/rdi, not esi/edi (switched to
- *     %0/%1/%2/%3/%4 notation; eliminated size suffixes from relevant add/
- *     inc/sub/mov instructions; changed dummy vars to pointers)
- *     - png_combine_row()
- *     - png_do_read_interlace()
- *     - png_read_filter_row_mmx_avg()
- *     - png_read_filter_row_mmx_paeth()
- *     - png_read_filter_row_mmx_sub()
- *     - png_read_filter_row_mmx_up()
- *  - NOTE:  this fix makes use of the fact that modifying a 32-bit reg (e.g.,
- *     %%ebx) clears the top half of its corresponding 64-bit reg (%%rbx), so
- *     it's safe to mix 32-bit operations with 64-bit base/index addressing
- *     (see new PSI/PAX/PBX/PDX/PBP/etc. "pointer-register" macros); applies
- *     also to clobber lists
- *
- * 20070630:
- *  - cleaned up formatting, macros, minor png_read_filter_row_mmx_sub() 8-bpp
- *     register-usage inefficiency
- *  - fixed 32-bit png_do_read_interlace() bug (was using pointer size for
- *     64-bit dummy values)
- *
- * 20070703:
- *  - added check for (manual) PIC macro to fix OpenBSD crash bug
- *
- * 20070717:
- *  - fixed 48-bit png_combine_row() bug (was acting like 32-bit):  copy 6
- *     bytes per pixel, not 4, and use stride of 6, not 4, in the second loop
- *     of interlace processing of 48-bit pixels [GR-P]
- *
- * 20070722:
- *  - fixed 64-bit png_uint_32 bug with MMXLength/FullLength temp vars
- *
- * [still broken:  tops of all row-filter blocks (input/output constraints);
- *  shows up on 64-bit dynamic (-fPIC) version with -O2, especially if debug-
- *  printfs enabled, but at right edge of odd-width images even if disabled]
- *
- *
- * STILL TO DO:
- *  - fix final thread-unsafe code using stack vars and pointer? (paeth top,
- *     default, bottom only:  default, bottom already 5 reg constraints; could
- *     replace bpp with pointer and group bpp/patemp/pbtemp/pctemp in array)
- *  - fix ebp/no-reg-constraint inefficiency (avg/paeth/sub top)
- *  - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
- *  - write MMX code for 48-bit case (pixel_bytes == 6)
- *  - figure out what's up with 24-bit case (pixel_bytes == 3):
- *     why subtract 8 from width_mmx in the pass 4/5 case?  due to
- *     odd number of bytes? (only width_mmx case) (near line 2335)
- *  - rewrite all MMX interlacing code so it's aligned with beginning
- *     of the row buffer, not the end (see 19991007 for details)
- *  - add error messages to any remaining bogus default cases
- *  - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
- *  - try =r, etc., as reg constraints?  (would gcc use 64-bit ones on x86-64?)
- *  - need full, non-graphical, CRC-based test suite...  maybe autogenerate
- *     random data of various height/width/depth, compute CRCs, write (C
- *     funcs), read (asm/MMX), recompute CRCs, and compare?
- *  - write true x86-64 version using 128-bit "media instructions", %xmm0-15,
- *     and extra general-purpose registers
- */
-
-#if defined(__GNUC__)
-
-#define PNG_INTERNAL
-#include "png.h"
-
-
-/* for some inexplicable reason, gcc 3.3.5 on OpenBSD (and elsewhere?) does
- * *not* define __PIC__ when the -fPIC option is used, so we have to rely on
- * makefiles and whatnot to define the PIC macro explicitly */
-#if defined(PIC) && !defined(__PIC__)   // (this can/should move to pngconf.h)
-#  define __PIC__
-#endif
-
-#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && \
-    defined(PNG_USE_PNGGCCRD) && defined(PNG_MMX_CODE_SUPPORTED)
-
-/* if you want/need full thread-safety on x86-64 even when linking statically,
- * comment out the "&& defined(__PIC__)" part here: */
-#if defined(__x86_64__) && defined(__PIC__)
-#  define PNG_x86_64_USE_GOTPCREL            // GOTPCREL => full thread-safety
-#  define PNG_CLOBBER_x86_64_REGS_SUPPORTED  // works as of gcc 3.4.3 ...
-#endif
-
-int PNGAPI png_mmx_support(void);
-
-#if defined(PNG_USE_LOCAL_ARRAYS)
-static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
-static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
-#endif
-
-/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
- * so define them without: */
-#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
-    defined(__OS2__) || defined(__APPLE__)
-#  define _mmx_supported  mmx_supported
-#  define _mask8_0        mask8_0
-#  define _mask16_1       mask16_1
-#  define _mask16_0       mask16_0
-#  define _mask24_2       mask24_2
-#  define _mask24_1       mask24_1
-#  define _mask24_0       mask24_0
-#  define _mask32_3       mask32_3
-#  define _mask32_2       mask32_2
-#  define _mask32_1       mask32_1
-#  define _mask32_0       mask32_0
-#  define _mask48_5       mask48_5
-#  define _mask48_4       mask48_4
-#  define _mask48_3       mask48_3
-#  define _mask48_2       mask48_2
-#  define _mask48_1       mask48_1
-#  define _mask48_0       mask48_0
-#  define _amask5_3_0     amask5_3_0
-#  define _amask7_1_0     amask7_1_0
-#  define _LBCarryMask    LBCarryMask
-#  define _HBClearMask    HBClearMask
-#  define _amask0_8_0     amask0_8_0
-#  define _amask6_2_0     amask6_2_0
-#  define _amask4_4_0     amask4_4_0
-#  define _amask0_2_6     amask0_2_6
-#  define _amask2_3_3     amask2_3_3
-#  define _amask4_2_2     amask4_2_2
-#  if defined(PNG_THREAD_UNSAFE_OK)
-#    define _patemp       patemp
-#    define _pbtemp       pbtemp
-#    define _pctemp       pctemp
-#  endif
-#endif // djgpp, Win32, Cygwin, OS2
-
-
-/* These constants are used in the inlined MMX assembly code. */
-
-typedef unsigned long long  ull;
-
-#if defined(PNG_x86_64_USE_GOTPCREL)
-static PNG_CONST struct {
-    //ull _mask_array[26];
-
-    // png_combine_row() constants:
-    ull _mask8_0;
-    ull _mask16_0, _mask16_1;
-    ull _mask24_0, _mask24_1, _mask24_2;
-    ull _mask32_0, _mask32_1, _mask32_2, _mask32_3;
-    ull _mask48_0, _mask48_1, _mask48_2, _mask48_3, _mask48_4, _mask48_5;
-
-    // png_do_read_interlace() constants:
-    ull _amask5_3_0, _amask7_1_0;  // was _const4 and _const6, respectively
-
-    // png_read_filter_row_mmx_avg() constants (also uses _amask5_3_0):
-    ull _LBCarryMask, _HBClearMask;
-    ull _amask0_8_0, _amask6_2_0;  // was ActiveMask for bpp 4/6 and 2 cases
-
-    // png_read_filter_row_mmx_paeth() constants (also uses _amask5_3_0):
-    ull _amask4_4_0, _amask0_2_6;  // was ActiveMask{,End} for bpp 6/4/8 and 3
-
-    // png_read_filter_row_mmx_sub() constants:
-    ull _amask2_3_3, _amask4_2_2;  // was ActiveMask for bpp 3 and 2 cases
-
-} _c64 __attribute__((used, aligned(8))) = {
-
-    // png_combine_row() constants:
-    0x0102040810204080LL, // _mask8_0      offset 0
-
-    0x1010202040408080LL, // _mask16_0     offset 8
-    0x0101020204040808LL, // _mask16_1     offset 16
-
-    0x2020404040808080LL, // _mask24_0     offset 24
-    0x0408080810101020LL, // _mask24_1     offset 32
-    0x0101010202020404LL, // _mask24_2     offset 40
-
-    0x4040404080808080LL, // _mask32_0     offset 48
-    0x1010101020202020LL, // _mask32_1     offset 56
-    0x0404040408080808LL, // _mask32_2     offset 64
-    0x0101010102020202LL, // _mask32_3     offset 72
-
-    0x4040808080808080LL, // _mask48_0     offset 80
-    0x2020202040404040LL, // _mask48_1     offset 88
-    0x1010101010102020LL, // _mask48_2     offset 96
-    0x0404080808080808LL, // _mask48_3     offset 104
-    0x0202020204040404LL, // _mask48_4     offset 112
-    0x0101010101010202LL, // _mask48_5     offset 120
-
-    // png_do_read_interlace() constants:
-    0x0000000000FFFFFFLL, // _amask5_3_0   offset 128  (bpp 3, avg/paeth) const4
-    0x00000000000000FFLL, // _amask7_1_0   offset 136                     const6
-
-    // png_read_filter_row_mmx_avg() constants:
-    0x0101010101010101LL, // _LBCarryMask  offset 144
-    0x7F7F7F7F7F7F7F7FLL, // _HBClearMask  offset 152
-    0xFFFFFFFFFFFFFFFFLL, // _amask0_8_0   offset 160  (bpp 4/6, avg)
-    0x000000000000FFFFLL, // _amask6_2_0   offset 168  (bpp 2,   avg)
-
-    // png_read_filter_row_mmx_paeth() constants:
-    0x00000000FFFFFFFFLL, // _amask4_4_0   offset 176  (bpp 6/4/8, paeth)
-    0xFFFF000000000000LL, // _amask0_2_6   offset 184  (bpp 3, paeth)   A.M.End
-
-    // png_read_filter_row_mmx_sub() constants:
-    0x0000FFFFFF000000LL, // _amask2_3_3   offset 192  (bpp 3, sub)
-    0x00000000FFFF0000LL, // _amask4_2_2   offset 200  (bpp 2, sub)
-
-};
-
-#define MASK8_0        "(%%rbp)"
-#define MASK16_0       "8(%%rbp)"
-#define MASK16_1       "16(%%rbp)"
-#define MASK24_0       "24(%%rbp)"
-#define MASK24_1       "32(%%rbp)"
-#define MASK24_2       "40(%%rbp)"
-#define MASK32_0       "48(%%rbp)"
-#define MASK32_1       "56(%%rbp)"
-#define MASK32_2       "64(%%rbp)"
-#define MASK32_3       "72(%%rbp)"
-#define MASK48_0       "80(%%rbp)"
-#define MASK48_1       "88(%%rbp)"
-#define MASK48_2       "96(%%rbp)"
-#define MASK48_3       "104(%%rbp)"
-#define MASK48_4       "112(%%rbp)"
-#define MASK48_5       "120(%%rbp)"
-#define AMASK5_3_0     "128(%%rbp)"
-#define AMASK7_1_0     "136(%%rbp)"
-#define LB_CARRY_MASK  "144(%%rbp)"
-#define HB_CLEAR_MASK  "152(%%rbp)"
-#define AMASK0_8_0     "160(%%rbp)"
-#define AMASK6_2_0     "168(%%rbp)"
-#define AMASK4_4_0     "176(%%rbp)"
-#define AMASK0_2_6     "184(%%rbp)"
-#define AMASK2_3_3     "192(%%rbp)"
-#define AMASK4_2_2     "200(%%rbp)"
-
-#else // !PNG_x86_64_USE_GOTPCREL
-
-static PNG_CONST ull _mask8_0  __attribute__((used, aligned(8))) = 0x0102040810204080LL;
-
-static PNG_CONST ull _mask16_1 __attribute__((used, aligned(8))) = 0x0101020204040808LL;
-static PNG_CONST ull _mask16_0 __attribute__((used, aligned(8))) = 0x1010202040408080LL;
-
-static PNG_CONST ull _mask24_2 __attribute__((used, aligned(8))) = 0x0101010202020404LL;
-static PNG_CONST ull _mask24_1 __attribute__((used, aligned(8))) = 0x0408080810101020LL;
-static PNG_CONST ull _mask24_0 __attribute__((used, aligned(8))) = 0x2020404040808080LL;
-
-static PNG_CONST ull _mask32_3 __attribute__((used, aligned(8))) = 0x0101010102020202LL;
-static PNG_CONST ull _mask32_2 __attribute__((used, aligned(8))) = 0x0404040408080808LL;
-static PNG_CONST ull _mask32_1 __attribute__((used, aligned(8))) = 0x1010101020202020LL;
-static PNG_CONST ull _mask32_0 __attribute__((used, aligned(8))) = 0x4040404080808080LL;
-
-static PNG_CONST ull _mask48_5 __attribute__((used, aligned(8))) = 0x0101010101010202LL;
-static PNG_CONST ull _mask48_4 __attribute__((used, aligned(8))) = 0x0202020204040404LL;
-static PNG_CONST ull _mask48_3 __attribute__((used, aligned(8))) = 0x0404080808080808LL;
-static PNG_CONST ull _mask48_2 __attribute__((used, aligned(8))) = 0x1010101010102020LL;
-static PNG_CONST ull _mask48_1 __attribute__((used, aligned(8))) = 0x2020202040404040LL;
-static PNG_CONST ull _mask48_0 __attribute__((used, aligned(8))) = 0x4040808080808080LL;
-
-// png_do_read_interlace() constants:
-static PNG_CONST ull _amask5_3_0  __attribute__((aligned(8))) = 0x0000000000FFFFFFLL;  // was _const4
-static PNG_CONST ull _amask7_1_0  __attribute__((aligned(8))) = 0x00000000000000FFLL;  // was _const6
-
-// png_read_filter_row_mmx_avg() constants:
-static PNG_CONST ull _LBCarryMask __attribute__((used, aligned(8))) = 0x0101010101010101LL;
-static PNG_CONST ull _HBClearMask __attribute__((used, aligned(8))) = 0x7f7f7f7f7f7f7f7fLL;
-static PNG_CONST ull _amask0_8_0  __attribute__((used, aligned(8))) = 0xFFFFFFFFFFFFFFFFLL;
-static PNG_CONST ull _amask6_2_0  __attribute__((used, aligned(8))) = 0x000000000000FFFFLL;
-
-// png_read_filter_row_mmx_paeth() constants:
-static PNG_CONST ull _amask4_4_0  __attribute__((used, aligned(8))) = 0x00000000FFFFFFFFLL;
-static PNG_CONST ull _amask0_2_6  __attribute__((used, aligned(8))) = 0xFFFF000000000000LL;
-
-// png_read_filter_row_mmx_sub() constants:
-static PNG_CONST ull _amask2_3_3  __attribute__((used, aligned(8))) = 0x0000FFFFFF000000LL;
-static PNG_CONST ull _amask4_2_2  __attribute__((used, aligned(8))) = 0x00000000FFFF0000LL;
-
-#define MASK8_0        "_mask8_0"
-#define MASK16_0       "_mask16_0"
-#define MASK16_1       "_mask16_1"
-#define MASK24_0       "_mask24_0"
-#define MASK24_1       "_mask24_1"
-#define MASK24_2       "_mask24_2"
-#define MASK32_0       "_mask32_0"
-#define MASK32_1       "_mask32_1"
-#define MASK32_2       "_mask32_2"
-#define MASK32_3       "_mask32_3"
-#define MASK48_0       "_mask48_0"
-#define MASK48_1       "_mask48_1"
-#define MASK48_2       "_mask48_2"
-#define MASK48_3       "_mask48_3"
-#define MASK48_4       "_mask48_4"
-#define MASK48_5       "_mask48_5"
-#define AMASK5_3_0     "_amask5_3_0"
-#define AMASK7_1_0     "_amask7_1_0"
-#define LB_CARRY_MASK  "_LBCarryMask"
-#define HB_CLEAR_MASK  "_HBClearMask"
-#define AMASK0_8_0     "_amask0_8_0"
-#define AMASK6_2_0     "_amask6_2_0"
-#define AMASK4_4_0     "_amask4_4_0"
-#define AMASK0_2_6     "_amask0_2_6"
-#define AMASK2_3_3     "_amask2_3_3"
-#define AMASK4_2_2     "_amask4_2_2"
-
-#endif // ?PNG_x86_64_USE_GOTPCREL
-
-
-#if defined(PNG_HAVE_MMX_READ_FILTER_ROW) || defined(PNG_HAVE_MMX_COMBINE_ROW)
-
-// this block is specific to png_read_filter_row_mmx_paeth() except for
-// LOAD_GOT_rbp and RESTORE_rbp, which are also used in png_combine_row()
-#if defined(PNG_x86_64_USE_GOTPCREL)
-#  define pa_TEMP                "%%r11d"
-#  define pb_TEMP                "%%r12d"
-#  define pc_TEMP                "%%r13d"
-#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)  // works as of gcc 3.4.3 ...
-#    define SAVE_r11_r12_r13
-#    define RESTORE_r11_r12_r13
-#    define _CLOBBER_r11_r12_r13 ,"%r11", "%r12", "%r13"
-#    define CLOBBER_r11_r12_r13  "%r11", "%r12", "%r13"
-#  else // !PNG_CLOBBER_x86_64_REGS_SUPPORTED
-#    define SAVE_r11_r12_r13     "pushq %%r11  \n\t" \
-                                 "pushq %%r12  \n\t" \
-                                 "pushq %%r13  \n\t"  // "normally 0-extended"
-#    define RESTORE_r11_r12_r13  "popq  %%r13  \n\t" \
-                                 "popq  %%r12  \n\t" \
-                                 "popq  %%r11  \n\t"
-#    define _CLOBBER_r11_r12_r13
-#    define CLOBBER_r11_r12_r13
-#  endif
-#  define LOAD_GOT_rbp           "pushq %%rbp                        \n\t" \
-                                 "movq  _c64@GOTPCREL(%%rip), %%rbp  \n\t"
-#  define RESTORE_rbp            "popq  %%rbp                        \n\t"
-#else // 32-bit and/or non-PIC
-#  if defined(PNG_THREAD_UNSAFE_OK)
-     // These variables are used in png_read_filter_row_mmx_paeth() and would be
-     //   local variables if not for gcc-inline-assembly addressing limitations
-     //   (some apparently related to ELF format, others to CPU type).
-     //
-     // WARNING: Their presence defeats the thread-safety of libpng.
-     static int                     _patemp  __attribute__((used));
-     static int                     _pbtemp  __attribute__((used));
-     static int                     _pctemp  __attribute__((used));
-#    define pa_TEMP                "_patemp"
-#    define pb_TEMP                "_pbtemp"  // temp variables for
-#    define pc_TEMP                "_pctemp"  //  Paeth routine
-#    define SAVE_r11_r12_r13
-#    define RESTORE_r11_r12_r13
-#    define _CLOBBER_r11_r12_r13   // not using regs => not clobbering
-#    define CLOBBER_r11_r12_r13
-#  endif // PNG_THREAD_UNSAFE_OK
-#  define LOAD_GOT_rbp
-#  define RESTORE_rbp
-#endif
-
-#if defined(__x86_64__)
-#  define SAVE_ebp
-#  define RESTORE_ebp
-#  define _CLOBBER_ebp         ,"%ebp"
-#  define CLOBBER_ebp          "%ebp"
-#  define SAVE_FullLength      "movl %%eax, %%r15d  \n\t"
-#  define RESTORE_FullLength   "movl %%r15d, "     // may go into eax or ecx
-#  if defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)   // works as of gcc 3.4.3 ...
-#    define SAVE_r15
-#    define RESTORE_r15
-#    define _CLOBBER_r15       ,"%r15"
-#  else
-#    define SAVE_r15           "pushq %%r15  \n\t"
-#    define RESTORE_r15        "popq  %%r15  \n\t"
-#    define _CLOBBER_r15
-#  endif
-#  define PBP                  "%%rbp"             // regs used for 64-bit
-#  define PAX                  "%%rax"             //  pointers or in
-#  define PBX                  "%%rbx"             //  combination with
-#  define PCX                  "%%rcx"             //  64-bit pointer-regs
-#  define PDX                  "%%rdx"             //  (base/index pairs,
-#  define PSI                  "%%rsi"             //  add/sub/mov pairs)
-#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffffffffffff8, "
-#else
-#  define SAVE_ebp             "pushl %%ebp \n\t"  // clobber list doesn't work
-#  define RESTORE_ebp          "popl  %%ebp \n\t"  //  for %ebp on 32-bit; not
-#  define _CLOBBER_ebp                             //  clear why not
-#  define CLOBBER_ebp
-#  define SAVE_FullLength      "pushl %%eax \n\t"
-#  define RESTORE_FullLength   "popl "             // eax (avg) or ecx (paeth)
-#  define SAVE_r15
-#  define RESTORE_r15
-#  define _CLOBBER_r15
-#  define PBP                  "%%ebp"             // regs used for or in
-#  define PAX                  "%%eax"             //  combination with
-#  define PBX                  "%%ebx"             //  "normal," 32-bit
-#  define PCX                  "%%ecx"             //  pointers
-#  define PDX                  "%%edx"
-#  define PSI                  "%%esi"
-#  define CLEAR_BOTTOM_3_BITS  "and  $0xfffffff8, "
-#endif
-
-// CLOB_COMMA_ebx_ebp:  need comma ONLY if both CLOBBER_ebp and CLOBBER_GOT_ebx
-//                      have values, i.e., only if __x86_64__ AND !__PIC__
-#if defined(__x86_64__) && !defined(__PIC__)
-#  define CLOB_COMMA_ebx_ebp    , // clobbering both ebp and ebx => need comma
-#else
-#  define CLOB_COMMA_ebx_ebp
-#endif
-
-// CLOB_COMMA_ebX_r1X:  need comma UNLESS both CLOBBER_ebp and CLOBBER_GOT_ebx
-//                   are empty OR CLOBBER_r11_r12_r13 is empty--i.e., NO comma
-//                   if (!__x86_64__ AND __PIC__) OR !(PNG_x86_64_USE_GOTPCREL
-//                   AND PNG_CLOBBER_x86_64_REGS_SUPPORTED)   (double sigh...)
-#if (!defined(__x86_64__) && defined(__PIC__)) || \
-    !defined(PNG_x86_64_USE_GOTPCREL) || \
-    !defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED)
-#  define CLOB_COMMA_ebX_r1X
-#else
-#  define CLOB_COMMA_ebX_r1X    , // clobbering (ebp OR ebx) AND r11_r12_r13
-#endif
-
-// CLOB_COLON_ebx_ebp:  need colon unless CLOBBER_ebp and CLOBBER_GOT_ebx are
-//                      BOTH empty--i.e., NO colon if (!__x86_64__ AND __PIC__)
-// CLOB_COLON_ebx_ebp_r1X:  if, in addition, CLOBBER_r11_r12_r13 is empty, then
-//                          no colon for Paeth blocks, either--i.e., NO colon
-//                          if !(PNG_x86_64_USE_GOTPCREL AND
-//                               PNG_CLOBBER_x86_64_REGS_SUPPORTED)
-#if (!defined(__x86_64__) && defined(__PIC__))
-#  define CLOB_COLON_ebx_ebp
-#  if !(defined(PNG_x86_64_USE_GOTPCREL) && \
-        defined(PNG_CLOBBER_x86_64_REGS_SUPPORTED))
-#    define CLOB_COLON_ebx_ebp_r1X
-#  else
-#    define CLOB_COLON_ebx_ebp_r1X  : // clobbering ebp OR ebx OR r11_r12_r13
-#  endif
-#else
-#  define CLOB_COLON_ebx_ebp        : // clobbering ebp OR ebx
-#  define CLOB_COLON_ebx_ebp_r1X    : // clobbering ebp OR ebx OR r11_r12_r13
-#endif
-
-#endif // PNG_HAVE_MMX_READ_FILTER_ROW
-
-#if defined(__PIC__)  // macros to save, restore index to Global Offset Table
-#  if defined(__x86_64__)
-#    define SAVE_GOT_ebx     "pushq %%rbx \n\t"
-#    define RESTORE_GOT_ebx  "popq  %%rbx \n\t"
-#  else
-#    define SAVE_GOT_ebx     "pushl %%ebx \n\t"
-#    define RESTORE_GOT_ebx  "popl  %%ebx \n\t"
-#  endif
-#  define _CLOBBER_GOT_ebx   // explicitly saved, restored => not clobbered
-#  define CLOBBER_GOT_ebx
-#else
-#  define SAVE_GOT_ebx
-#  define RESTORE_GOT_ebx
-#  define _CLOBBER_GOT_ebx   ,"%ebx"
-#  define CLOBBER_GOT_ebx    "%ebx"
-#endif
-
-#if defined(PNG_HAVE_MMX_COMBINE_ROW) || defined(PNG_HAVE_MMX_READ_INTERLACE)
-#  define BPP2  2
-#  define BPP3  3  // bytes per pixel (a.k.a. pixel_bytes)
-#  define BPP4  4  // (defined only to help avoid cut-and-paste errors)
-#  define BPP6  6
-#  define BPP8  8
-#endif
-
-
-
-static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested
-
-/*===========================================================================*/
-/*                                                                           */
-/*                      P N G _ M M X _ S U P P O R T                        */
-/*                                                                           */
-/*===========================================================================*/
-
-// GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
-//             (2) all instructions compile with gcc 2.7.2.3 and later
-//           x (3) the function is moved down here to prevent gcc from
-//           x      inlining it in multiple places and then barfing be-
-//           x      cause the ".NOT_SUPPORTED" label is multiply defined
-//                  [need to retest with gcc 2.7.2.3]
-
-// GRR 20070524:  This declaration apparently is compatible with but supersedes
-//   the one in png.h; in any case, the generated object file is slightly
-//   smaller.  It is unnecessary with gcc 4.1.2, but gcc 2.x apparently
-//   replicated the ".NOT_SUPPORTED" label in each location the function was
-//   inlined, leading to compilation errors due to the "multiply defined"
-//   label.  Old workaround was to leave the function at the end of this
-//   file; new one (still testing) is to use a gcc-specific function attribute
-//   to prevent local inlining.
-int PNGAPI
-png_mmx_support(void) __attribute__((noinline));
-
-int PNGAPI
-png_mmx_support(void)
-{
-    int result;
-#if defined(PNG_MMX_CODE_SUPPORTED)  // superfluous, but what the heck
-    __asm__ __volatile__ (
-#if defined(__x86_64__)
-        "pushq %%rbx          \n\t"  // rbx gets clobbered by CPUID instruction
-        "pushq %%rcx          \n\t"  // so does rcx...
-        "pushq %%rdx          \n\t"  // ...and rdx (but rcx & rdx safe on Linux)
-        "pushfq               \n\t"  // save Eflag to stack
-        "popq %%rax           \n\t"  // get Eflag from stack into rax
-        "movq %%rax, %%rcx    \n\t"  // make another copy of Eflag in rcx
-        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
-        "pushq %%rax          \n\t"  // save modified Eflag back to stack
-        "popfq                \n\t"  // restore modified value to Eflag reg
-        "pushfq               \n\t"  // save Eflag to stack
-        "popq %%rax           \n\t"  // get Eflag from stack
-        "pushq %%rcx          \n\t"  // save original Eflag to stack
-        "popfq                \n\t"  // restore original Eflag
-#else
-        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
-        "pushl %%ecx          \n\t"  // so does ecx...
-        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
-        "pushfl               \n\t"  // save Eflag to stack
-        "popl %%eax           \n\t"  // get Eflag from stack into eax
-        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
-        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
-        "pushl %%eax          \n\t"  // save modified Eflag back to stack
-        "popfl                \n\t"  // restore modified value to Eflag reg
-        "pushfl               \n\t"  // save Eflag to stack
-        "popl %%eax           \n\t"  // get Eflag from stack
-        "pushl %%ecx          \n\t"  // save original Eflag to stack
-        "popfl                \n\t"  // restore original Eflag
-#endif
-        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
-        "jz 0f                \n\t"  // if same, CPUID instr. is not supported
-
-        "xorl %%eax, %%eax    \n\t"  // set eax to zero
-//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
-        "cpuid                \n\t"  // get the CPU identification info
-        "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
-        "jl 0f                \n\t"  // if eax is zero, MMX is not supported
-
-        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
-        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
-                                     // faster than the instruction "mov eax, 1"
-        "cpuid                \n\t"  // get the CPU identification info again
-        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
-        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
-        "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
-
-        "movl $1, %%eax       \n\t"  // set return value to 1
-        "jmp  1f              \n\t"  // DONE:  have MMX support
-
-    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
-        "movl $0, %%eax       \n\t"  // set return value to 0
-    "1:                       \n\t"  // .RETURN: target label for jump instructions
-#if defined(__x86_64__)
-        "popq %%rdx           \n\t"  // restore rdx
-        "popq %%rcx           \n\t"  // restore rcx
-        "popq %%rbx           \n\t"  // restore rbx
-#else
-        "popl %%edx           \n\t"  // restore edx
-        "popl %%ecx           \n\t"  // restore ecx
-        "popl %%ebx           \n\t"  // restore ebx
-#endif
-
-//      "ret                  \n\t"  // DONE:  no MMX support
-                                     // (fall through to standard C "ret")
-
-        : "=a" (result)              // output list
-
-        :                            // any variables used on input (none)
-
-                                     // no clobber list
-//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
-//      , "memory"   // if write to a variable gcc thought was in a reg
-//      , "cc"       // "condition codes" (flag bits)
-    );
-    _mmx_supported = result;
-#else
-    _mmx_supported = 0;
-#endif /* PNG_MMX_CODE_SUPPORTED */
-
-    return _mmx_supported;
-}
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                       P N G _ C O M B I N E _ R O W                       */
-/*                                                                           */
-/*===========================================================================*/
-
-#if defined(PNG_HAVE_MMX_COMBINE_ROW)
-
-/* Combines the row recently read in with the previous row.
-   This routine takes care of alpha and transparency if requested.
-   This routine also handles the two methods of progressive display
-   of interlaced images, depending on the mask value.
-   The mask value describes which pixels are to be combined with
-   the row.  The pattern always repeats every 8 pixels, so just 8
-   bits are needed.  A one indicates the pixel is to be combined; a
-   zero indicates the pixel is to be skipped.  This is in addition
-   to any alpha or transparency value associated with the pixel.
-   If you want all pixels to be combined, pass 0xff (255) in mask. */
-
-/* Use this routine for the x86 platform - it uses a faster MMX routine
-   if the machine supports MMX. */
-
-void /* PRIVATE */
-png_combine_row(png_structp png_ptr, png_bytep row, int mask)
-{
-   int dummy_value_a;    // fix 'forbidden register spilled' error
-   int dummy_value_c;
-   int dummy_value_d;
-   png_bytep dummy_value_S;
-   png_bytep dummy_value_D;
-
-   png_debug(1, "in png_combine_row (pnggccrd.c)\n");
-
-   if (_mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (mask == 0xff)
-   {
-      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
-      png_memcpy(row, png_ptr->row_buf + 1,
-       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
-   }
-   else   /* (png_combine_row() is never called with mask == 0) */
-   {
-      switch (png_ptr->row_info.pixel_depth)
-      {
-         case 24:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK24_0 ", %%mm0 \n\t" // _mask24_0 -> mm0
-                  "movq   " MASK24_1 ", %%mm1 \n\t" // _mask24_1 -> mm1
-                  "movq   " MASK24_2 ", %%mm2 \n\t" // _mask24_2 -> mm2
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-                  "pand      %%mm7, %%mm2     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-                  "pcmpeqb   %%mm6, %%mm2     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        mainloop24end    \n\t"
-
-                "mainloop24:                  \n\t"
-                  "movq      (%3), %%mm4      \n\t"
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "movq      (%4), %%mm7      \n\t"
-                  "pandn     %%mm7, %%mm6     \n\t"
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm5     \n\t"
-                  "pand      %%mm1, %%mm5     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "movq      8(%4), %%mm6     \n\t"
-                  "pandn     %%mm6, %%mm7     \n\t"
-                  "por       %%mm7, %%mm5     \n\t"
-                  "movq      %%mm5, 8(%4)     \n\t"
-
-                  "movq      16(%3), %%mm6    \n\t"
-                  "pand      %%mm2, %%mm6     \n\t"
-                  "movq      %%mm2, %%mm4     \n\t"
-                  "movq      16(%4), %%mm7    \n\t"
-                  "pandn     %%mm7, %%mm4     \n\t"
-                  "por       %%mm4, %%mm6     \n\t"
-                  "movq      %%mm6, 16(%4)    \n\t"
-
-                  "add       $24, %3          \n\t" // inc by 24 bytes processed
-                  "add       $24, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-
-                  "ja        mainloop24       \n\t"
-
-                "mainloop24end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end24            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop24:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip24           \n\t" // if CF = 0
-                  "movw      (%3), %%ax       \n\t"
-                  "movw      %%ax, (%4)       \n\t"
-                  "xorl      %%eax, %%eax     \n\t"
-                  "movb      2(%3), %%al      \n\t"
-                  "movb      %%al, 2(%4)      \n\t"
-
-                "skip24:                      \n\t"
-                  "add       $3, %3           \n\t"
-                  "add       $3, %4           \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop24     \n\t"
-
-                "end24:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm2"          // clobber list
-                  , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP3;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 24 bpp */
-
-         // formerly claimed to be most common case (combining 32-bit RGBA),
-         // but almost certainly less common than 24-bit RGB case
-         case 32:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK32_0 ", %%mm0 \n\t" // _mask32_0
-                  "movq   " MASK32_1 ", %%mm1 \n\t" // _mask32_1
-                  "movq   " MASK32_2 ", %%mm2 \n\t" // _mask32_2
-                  "movq   " MASK32_3 ", %%mm3 \n\t" // _mask32_3
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-                  "pand      %%mm7, %%mm2     \n\t"
-                  "pand      %%mm7, %%mm3     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-                  "pcmpeqb   %%mm6, %%mm2     \n\t"
-                  "pcmpeqb   %%mm6, %%mm3     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t" // lcr
-                  "jz        mainloop32end    \n\t"
-
-                "mainloop32:                  \n\t"
-                  "movq      (%3), %%mm4      \n\t"
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "movq      (%4), %%mm7      \n\t"
-                  "pandn     %%mm7, %%mm6     \n\t"
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm5     \n\t"
-                  "pand      %%mm1, %%mm5     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "movq      8(%4), %%mm6     \n\t"
-                  "pandn     %%mm6, %%mm7     \n\t"
-                  "por       %%mm7, %%mm5     \n\t"
-                  "movq      %%mm5, 8(%4)     \n\t"
-
-                  "movq      16(%3), %%mm6    \n\t"
-                  "pand      %%mm2, %%mm6     \n\t"
-                  "movq      %%mm2, %%mm4     \n\t"
-                  "movq      16(%4), %%mm7    \n\t"
-                  "pandn     %%mm7, %%mm4     \n\t"
-                  "por       %%mm4, %%mm6     \n\t"
-                  "movq      %%mm6, 16(%4)    \n\t"
-
-                  "movq      24(%3), %%mm7    \n\t"
-                  "pand      %%mm3, %%mm7     \n\t"
-                  "movq      %%mm3, %%mm5     \n\t"
-                  "movq      24(%4), %%mm4    \n\t"
-                  "pandn     %%mm4, %%mm5     \n\t"
-                  "por       %%mm5, %%mm7     \n\t"
-                  "movq      %%mm7, 24(%4)    \n\t"
-
-                  "add       $32, %3          \n\t" // inc by 32 bytes processed
-                  "add       $32, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-                  "ja        mainloop32       \n\t"
-
-                "mainloop32end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end32            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // low byte => high byte
-
-                "secondloop32:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip32           \n\t" // if CF = 0
-                  "movl      (%3), %%eax      \n\t"
-                  "movl      %%eax, (%4)      \n\t"
-
-                "skip32:                      \n\t"
-                  "add       $4, %3           \n\t"
-                  "add       $4, %4           \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop32     \n\t"
-
-                "end32:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
-                  , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP4;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 32 bpp */
-
-         case 8:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK8_0 ", %%mm0  \n\t" // _mask8_0 -> mm0
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t" // nonzero if keep byte
-                  "pcmpeqb   %%mm6, %%mm0     \n\t" // zeros->1s, v versa
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t" // len == 0 ?
-                  "je        mainloop8end     \n\t"
-
-                "mainloop8:                   \n\t"
-                  "movq      (%3), %%mm4      \n\t" // *srcptr
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "pandn     (%4), %%mm6      \n\t" // *dstptr
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-                  "add       $8, %3           \n\t" // inc by 8 bytes processed
-                  "add       $8, %4           \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-                  "ja        mainloop8        \n\t"
-
-                "mainloop8end:                \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end8             \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop8:                 \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip8            \n\t" // if CF = 0
-                  "movb      (%3), %%al       \n\t"
-                  "movb      %%al, (%4)       \n\t"
-
-                "skip8:                       \n\t"
-                  "inc       %3               \n\t"
-                  "inc       %4               \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop8      \n\t"
-
-                "end8:                        \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = len;  /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff /* *BPP1 */ ;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 8 bpp */
-
-         case 1:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_inc, s_start, s_end;
-            int m;
-            int shift;
-            png_uint_32 i;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-            else
-#endif
-            {
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  int value;
-
-                  value = (*sp >> shift) & 0x1;
-                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 1 bpp */
-
-         case 2:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-            else
-#endif
-            {
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0x3;
-                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 2 bpp */
-
-         case 4:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-            else
-#endif
-            {
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0xf;
-                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 4 bpp */
-
-         case 16:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK16_0 ", %%mm0 \n\t" // _mask16_0 -> mm0
-                  "movq   " MASK16_1 ", %%mm1 \n\t" // _mask16_1 -> mm1
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        mainloop16end    \n\t"
-
-                "mainloop16:                  \n\t"
-                  "movq      (%3), %%mm4      \n\t"
-                  "pand      %%mm0, %%mm4     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "movq      (%4), %%mm7      \n\t"
-                  "pandn     %%mm7, %%mm6     \n\t"
-                  "por       %%mm6, %%mm4     \n\t"
-                  "movq      %%mm4, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm5     \n\t"
-                  "pand      %%mm1, %%mm5     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "movq      8(%4), %%mm6     \n\t"
-                  "pandn     %%mm6, %%mm7     \n\t"
-                  "por       %%mm7, %%mm5     \n\t"
-                  "movq      %%mm5, 8(%4)     \n\t"
-
-                  "add       $16, %3          \n\t" // inc by 16 bytes processed
-                  "add       $16, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-                  "ja        mainloop16       \n\t"
-
-                "mainloop16end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end16            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop16:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip16           \n\t" // if CF = 0
-                  "movw      (%3), %%ax       \n\t"
-                  "movw      %%ax, (%4)       \n\t"
-
-                "skip16:                      \n\t"
-                  "add       $2, %3           \n\t"
-                  "add       $2, %4           \n\t"
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop16     \n\t"
-
-                "end16:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm4"          // clobber list
-                  , "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP2;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 16 bpp */
-
-         case 48:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               png_uint_32 len;
-               int diff;
-
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               len  = png_ptr->width & ~7;          // reduce to multiple of 8
-               diff = (int) (png_ptr->width & 7);   // amount lost
-
-               __asm__ __volatile__ (
-                  "not       %%edx            \n\t" // mask => unmask
-                  "movd      %%edx, %%mm7     \n\t" // load bit pattern
-                  "not       %%edx            \n\t" // unmask => mask for later
-                  "psubb     %%mm6, %%mm6     \n\t" // zero mm6
-                  "punpcklbw %%mm7, %%mm7     \n\t"
-                  "punpcklwd %%mm7, %%mm7     \n\t"
-                  "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
-
-                  LOAD_GOT_rbp
-                  "movq   " MASK48_0 ", %%mm0 \n\t" // _mask48_0 -> mm0
-                  "movq   " MASK48_1 ", %%mm1 \n\t" // _mask48_1 -> mm1
-                  "movq   " MASK48_2 ", %%mm2 \n\t" // _mask48_2 -> mm2
-                  "movq   " MASK48_3 ", %%mm3 \n\t" // _mask48_3 -> mm3
-                  "movq   " MASK48_4 ", %%mm4 \n\t" // _mask48_4 -> mm4
-                  "movq   " MASK48_5 ", %%mm5 \n\t" // _mask48_5 -> mm5
-                  RESTORE_rbp
-
-                  "pand      %%mm7, %%mm0     \n\t"
-                  "pand      %%mm7, %%mm1     \n\t"
-                  "pand      %%mm7, %%mm2     \n\t"
-                  "pand      %%mm7, %%mm3     \n\t"
-                  "pand      %%mm7, %%mm4     \n\t"
-                  "pand      %%mm7, %%mm5     \n\t"
-
-                  "pcmpeqb   %%mm6, %%mm0     \n\t"
-                  "pcmpeqb   %%mm6, %%mm1     \n\t"
-                  "pcmpeqb   %%mm6, %%mm2     \n\t"
-                  "pcmpeqb   %%mm6, %%mm3     \n\t"
-                  "pcmpeqb   %%mm6, %%mm4     \n\t"
-                  "pcmpeqb   %%mm6, %%mm5     \n\t"
-
-// preload        "movl      len, %%ecx       \n\t" // load length of line
-// preload        "movl      srcptr, %3       \n\t" // load source
-// preload        "movl      dstptr, %4       \n\t" // load dest
-
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        mainloop48end    \n\t"
-
-                "mainloop48:                  \n\t"
-                  "movq      (%3), %%mm7      \n\t"
-                  "pand      %%mm0, %%mm7     \n\t"
-                  "movq      %%mm0, %%mm6     \n\t"
-                  "pandn     (%4), %%mm6      \n\t"
-                  "por       %%mm6, %%mm7     \n\t"
-                  "movq      %%mm7, (%4)      \n\t"
-
-                  "movq      8(%3), %%mm6     \n\t"
-                  "pand      %%mm1, %%mm6     \n\t"
-                  "movq      %%mm1, %%mm7     \n\t"
-                  "pandn     8(%4), %%mm7     \n\t"
-                  "por       %%mm7, %%mm6     \n\t"
-                  "movq      %%mm6, 8(%4)     \n\t"
-
-                  "movq      16(%3), %%mm6    \n\t"
-                  "pand      %%mm2, %%mm6     \n\t"
-                  "movq      %%mm2, %%mm7     \n\t"
-                  "pandn     16(%4), %%mm7    \n\t"
-                  "por       %%mm7, %%mm6     \n\t"
-                  "movq      %%mm6, 16(%4)    \n\t"
-
-                  "movq      24(%3), %%mm7    \n\t"
-                  "pand      %%mm3, %%mm7     \n\t"
-                  "movq      %%mm3, %%mm6     \n\t"
-                  "pandn     24(%4), %%mm6    \n\t"
-                  "por       %%mm6, %%mm7     \n\t"
-                  "movq      %%mm7, 24(%4)    \n\t"
-
-                  "movq      32(%3), %%mm6    \n\t"
-                  "pand      %%mm4, %%mm6     \n\t"
-                  "movq      %%mm4, %%mm7     \n\t"
-                  "pandn     32(%4), %%mm7    \n\t"
-                  "por       %%mm7, %%mm6     \n\t"
-                  "movq      %%mm6, 32(%4)    \n\t"
-
-                  "movq      40(%3), %%mm7    \n\t"
-                  "pand      %%mm5, %%mm7     \n\t"
-                  "movq      %%mm5, %%mm6     \n\t"
-                  "pandn     40(%4), %%mm6    \n\t"
-                  "por       %%mm6, %%mm7     \n\t"
-                  "movq      %%mm7, 40(%4)    \n\t"
-
-                  "add       $48, %3          \n\t" // inc by 48 bytes processed
-                  "add       $48, %4          \n\t"
-                  "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
-
-                  "ja        mainloop48       \n\t"
-
-                "mainloop48end:               \n\t"
-// preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
-                  "movl      %%eax, %%ecx     \n\t"
-                  "cmpl      $0, %%ecx        \n\t"
-                  "jz        end48            \n\t"
-// preload        "movl      mask, %%edx      \n\t"
-                  "sall      $24, %%edx       \n\t" // make low byte, high byte
-
-                "secondloop48:                \n\t"
-                  "sall      %%edx            \n\t" // move high bit to CF
-                  "jnc       skip48           \n\t" // if CF = 0
-                  "movl      (%3), %%eax      \n\t"
-                  "movl      %%eax, (%4)      \n\t"
-                  "movw      4(%3), %%ax      \n\t" // GR-P bugfix 20070717
-                  "movw      %%ax, 4(%4)      \n\t" // GR-P bugfix 20070717
-
-                "skip48:                      \n\t"
-                  "add       $6, %3           \n\t" // GR-P bugfix 20070717
-                  "add       $6, %4           \n\t" // GR-P bugfix 20070717
-                  "decl      %%ecx            \n\t"
-                  "jnz       secondloop48     \n\t"
-
-                "end48:                       \n\t"
-                  "EMMS                       \n\t" // DONE
-
-                  : "=a" (dummy_value_a),           // output regs (dummy)
-                    "=d" (dummy_value_d),
-                    "=c" (dummy_value_c),
-                    "=S" (dummy_value_S),
-                    "=D" (dummy_value_D)
-
-                  : "0" (diff),        // eax       // input regs
-                    "1" (mask),        // edx
-                    "2" (len),         // ecx
-// was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
-                    "3" (srcptr),      // esi/rsi
-                    "4" (dstptr)       // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                  : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
-                  , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-               );
-            }
-            else /* not _mmx_supported - use modified C routine */
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP6;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 48 bpp */
-
-         case 64:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            register png_uint_32 i;
-            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
-              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
-              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
-              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
-            int diff = (int) (png_ptr->width & 7); /* amount lost */
-            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
-
-            srcptr = png_ptr->row_buf + 1 + initial_val;
-            dstptr = row + initial_val;
-
-            for (i = initial_val; i < final_val; i += stride)
-            {
-               png_memcpy(dstptr, srcptr, rep_bytes);
-               srcptr += stride;
-               dstptr += stride;
-            }
-            if (diff)  /* number of leftover pixels:  3 for pngtest */
-            {
-               final_val += diff*BPP8;
-               for (; i < final_val; i += stride)
-               {
-                  if (rep_bytes > (int)(final_val-i))
-                     rep_bytes = (int)(final_val-i);
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-            }
-
-            break;
-         }       /* end 64 bpp */
-
-         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
-         {
-            // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-            png_debug(1, "Internal libpng logic error (GCC "
-              "png_combine_row() pixel_depth)\n");
-#endif
-            break;
-         }
-      } /* end switch (png_ptr->row_info.pixel_depth) */
-
-   } /* end if (non-trivial mask) */
-
-} /* end png_combine_row() */
-
-#endif /* PNG_HAVE_MMX_COMBINE_ROW */
-
-
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
-/*                                                                           */
-/*===========================================================================*/
-
-#if defined(PNG_READ_INTERLACING_SUPPORTED)
-#if defined(PNG_HAVE_MMX_READ_INTERLACE)
-
-/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
- * has taken place.  [GRR: what other steps come before and/or after?]
- */
-
-void /* PRIVATE */
-png_do_read_interlace(png_structp png_ptr)
-{
-   png_row_infop row_info = &(png_ptr->row_info);
-   png_bytep row = png_ptr->row_buf + 1;
-   int pass = png_ptr->pass;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-   png_uint_32 transformations = png_ptr->transformations;
-#endif
-
-   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
-
-   if (_mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (row != NULL && row_info != NULL)
-   {
-      png_uint_32 final_width;
-
-      final_width = row_info->width * png_pass_inc[pass];
-
-      switch (row_info->pixel_depth)
-      {
-         case 1:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_byte v;
-            png_uint_32 i;
-            int j;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 3);
-            dp = row + (png_size_t)((final_width - 1) >> 3);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (int)((row_info->width + 7) & 7);
-               dshift = (int)((final_width + 7) & 7);
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-            else
-#endif
-            {
-               sshift = 7 - (int)((row_info->width + 7) & 7);
-               dshift = 7 - (int)((final_width + 7) & 7);
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               v = (png_byte)((*sp >> sshift) & 0x1);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 2);
-            dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0x3);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 1);
-            dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0xf);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-       /*====================================================================*/
-
-         default: /* 8-bit or larger (this is where the routine is modified) */
-         {
-            png_bytep sptr, dp;
-            png_uint_32 i;
-            png_size_t pixel_bytes;
-            int width = (int)row_info->width;
-
-            pixel_bytes = (row_info->pixel_depth >> 3);
-
-            /* point sptr at the last pixel in the pre-expanded row: */
-            sptr = row + (width - 1) * pixel_bytes;
-
-            /* point dp at the last pixel position in the expanded row: */
-            dp = row + (final_width - 1) * pixel_bytes;
-
-            /* New code by Nirav Chhatrapati - Intel Corporation */
-
-#if !defined(PNG_1_0_X)
-            if (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
-#else
-            if (_mmx_supported)
-#endif
-            {
-               int dummy_value_c;        // fix 'forbidden register spilled'
-               png_bytep dummy_value_S;
-               png_bytep dummy_value_D;
-               png_bytep dummy_value_a;
-               png_bytep dummy_value_d;
-
-               //--------------------------------------------------------------
-               if (pixel_bytes == BPP3)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
-                     if (width_mmx < 0)
-                         width_mmx = 0;
-                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
-                     if (width_mmx)
-                     {
-                        // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
-                        // sptr points at last pixel in pre-expanded row
-                        // dp points at last pixel position in expanded row
-                        __asm__ __volatile__ (
-                           "sub  $3, %1             \n\t"
-                           "sub  $9, %2             \n\t"
-                                        // (png_pass_inc[pass] + 1)*pixel_bytes
-
-                        ".loop3_pass4:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // x x 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
-                           "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
-                           "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
-                           "pand (%3), %%mm1        \n\t" // z z z z z 2 1 0
-                           "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
-                           "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
-                           "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
-                           "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
-                           "movq %%mm0, (%2)        \n\t"
-                           "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
-                           "pand (%4), %%mm3        \n\t" // z z z z z z z 5
-                           "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
-                           "sub  $6, %1             \n\t"
-                           "movd %%mm2, 8(%2)       \n\t"
-                           "sub  $12, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop3_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D),
-                             "=a" (dummy_value_a),
-                             "=d" (dummy_value_d)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp),            // edi/rdi
-#if defined(PNG_x86_64_USE_GOTPCREL)     // formerly _const4 and _const6:
-                             "3" (&_c64._amask5_3_0), // (0x0000000000FFFFFFLL)
-                             "4" (&_c64._amask7_1_0)  // (0x00000000000000FFLL)
-#else
-                             "3" (&_amask5_3_0),  // eax (0x0000000000FFFFFFLL)
-                             "4" (&_amask7_1_0)   // edx (0x00000000000000FFLL)
-#endif
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-                           , "%mm2", "%mm3"
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx*BPP3;
-                     dp -= width_mmx*2*BPP3;
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-
-                        png_memcpy(v, sptr, BPP3);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           png_memcpy(dp, v, BPP3);
-                           dp -= BPP3;
-                        }
-                        sptr -= BPP3;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     __asm__ __volatile__ (
-                        "sub  $9, %2             \n\t"
-                                     // (png_pass_inc[pass] - 1)*pixel_bytes
-
-                     ".loop3_pass2:              \n\t"
-                        "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
-                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
-                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
-                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
-                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
-                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
-                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
-                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
-                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
-                        "movq %%mm0, 4(%2)       \n\t"
-                        "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
-                        "sub  $3, %1             \n\t"
-                        "movd %%mm0, (%2)        \n\t"
-                        "sub  $12, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop3_pass2        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D),
-                          "=a" (dummy_value_a)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp),            // edi/rdi
-#if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
-                          "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
-#else
-                          "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
-#endif
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0", "%mm1", "%mm2"       // clobber list
-#endif
-                     );
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     __asm__ __volatile__ (
-                        "sub  $21, %2            \n\t"
-                                     // (png_pass_inc[pass] - 1)*pixel_bytes
-
-                     ".loop3_pass0:              \n\t"
-                        "movd (%1), %%mm0        \n\t" // x x x x x 2 1 0
-                        "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
-                        "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
-                        "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
-                        "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
-                        "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
-                        "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
-                        "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
-                        "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
-                        "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
-                        "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
-                        "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
-                        "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
-                        "movq %%mm4, 16(%2)      \n\t"
-                        "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
-                        "movq %%mm3, 8(%2)       \n\t"
-                        "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
-                        "sub  $3, %1             \n\t"
-                        "movq %%mm0, (%2)        \n\t"
-                        "sub  $24, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop3_pass0        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D),
-                          "=a" (dummy_value_a)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp),            // edi/rdi
-#if defined(PNG_x86_64_USE_GOTPCREL)           // formerly _const4:
-                          "3" (&_c64._amask5_3_0)  // (0x0000000000FFFFFFLL)
-#else
-                          "3" (&_amask5_3_0)   // eax (0x0000000000FFFFFFLL)
-#endif
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0", "%mm1", "%mm2"       // clobber list
-                        , "%mm3", "%mm4"
-#endif
-                     );
-                  }
-               } /* end of pixel_bytes == 3 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP4)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $4, %1             \n\t"
-                           "sub  $12, %2            \n\t"
-
-                        ".loop4_pass4:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $8, %1             \n\t"
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop4_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*BPP4 - BPP4); // sign fixed
-                     dp -= (width_mmx*2*BPP4 - BPP4); // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= BPP4;
-                        png_memcpy(v, sptr, BPP4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= BPP4;
-                           png_memcpy(dp, v, BPP4);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $4, %1             \n\t"
-                           "sub  $28, %2            \n\t"
-
-                        ".loop4_pass2:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
-                           "movq %%mm0, (%2)        \n\t"
-                           "movq %%mm0, 8(%2)       \n\t"
-                           "movq %%mm1, 16(%2)      \n\t"
-                           "movq %%mm1, 24(%2)      \n\t"
-                           "sub  $8, %1             \n\t"
-                           "sub  $32, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop4_pass2        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*4 - 4); // sign fixed
-                     dp -= (width_mmx*16 - 4);  // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;        // 0,1 pixels => 0,4 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $4, %1             \n\t"
-                           "sub  $60, %2            \n\t"
-
-                        ".loop4_pass0:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
-                           "movq %%mm0, (%2)        \n\t"
-                           "movq %%mm0, 8(%2)       \n\t"
-                           "movq %%mm0, 16(%2)      \n\t"
-                           "movq %%mm0, 24(%2)      \n\t"
-                           "movq %%mm1, 32(%2)      \n\t"
-                           "movq %%mm1, 40(%2)      \n\t"
-                           "movq %%mm1, 48(%2)      \n\t"
-                           "sub  $8, %1             \n\t"
-                           "movq %%mm1, 56(%2)      \n\t"
-                           "sub  $64, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop4_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*4 - 4); // sign fixed
-                     dp -= (width_mmx*32 - 4);  // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-               } /* end of pixel_bytes == 4 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == 1)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 3) << 3);
-                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $7, %1             \n\t"
-                           "sub  $15, %2            \n\t"
-
-                        ".loop1_pass4:              \n\t"
-                           "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
-                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
-                           "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $8, %1             \n\t"
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $8, %%ecx          \n\t"
-                           "jnz .loop1_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*2;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        --sptr;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $3, %1             \n\t"
-                           "sub  $15, %2            \n\t"
-
-                        ".loop1_pass2:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
-                           "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
-                           "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $4, %%ecx          \n\t"
-                           "jnz .loop1_pass2        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*4;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        --sptr;
-                     }
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;        // 0-3 pixels => 0-3 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $3, %1             \n\t"
-                           "sub  $31, %2            \n\t"
-
-                        ".loop1_pass0:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
-                           "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
-                           "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
-                           "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
-                           "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
-                           "movq %%mm0, (%2)        \n\t"
-                           "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
-                           "movq %%mm3, 8(%2)       \n\t"
-                           "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
-                           "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
-                           "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
-                           "movq %%mm2, 16(%2)      \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm4, 24(%2)      \n\t"
-                           "sub  $32, %2            \n\t"
-                           "subl $4, %%ecx          \n\t"
-                           "jnz .loop1_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1", "%mm2"       // clobber list
-                           , "%mm3", "%mm4"
-#endif
-                        );
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*8;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                       /* I simplified this part in version 1.0.4e
-                        * here and in several other instances where
-                        * pixel_bytes == 1  -- GR-P
-                        *
-                        * Original code:
-                        *
-                        * png_byte v[8];
-                        * png_memcpy(v, sptr, pixel_bytes);
-                        * for (j = 0; j < png_pass_inc[pass]; j++)
-                        * {
-                        *    png_memcpy(dp, v, pixel_bytes);
-                        *    dp -= pixel_bytes;
-                        * }
-                        * sptr -= pixel_bytes;
-                        *
-                        * Replacement code is in the next three lines:
-                        */
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        --sptr;
-                     }
-                  }
-               } /* end of pixel_bytes == 1 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP2)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $2, %1             \n\t"
-                           "sub  $6, %2             \n\t"
-
-                        ".loop2_pass4:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $8, %2             \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop2_pass4        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0"                       // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*BPP2 - BPP2); // sign fixed
-                     dp -= (width_mmx*2*BPP2 - BPP2); // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= BPP2;
-                        png_memcpy(v, sptr, BPP2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= BPP2;
-                           png_memcpy(dp, v, BPP2);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $2, %1             \n\t"
-                           "sub  $14, %2            \n\t"
-
-                        ".loop2_pass2:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
-                           "movq %%mm0, (%2)        \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm1, 8(%2)       \n\t"
-                           "sub  $16, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop2_pass2        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*2 - 2); // sign fixed
-                     dp -= (width_mmx*8 - 2);   // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;        // 0,1 pixels => 0,2 bytes
-                     if (width_mmx)
-                     {
-                        __asm__ __volatile__ (
-                           "sub  $2, %1             \n\t"
-                           "sub  $30, %2            \n\t"
-
-                        ".loop2_pass0:              \n\t"
-                           "movd (%1), %%mm0        \n\t" // x x x x 3 2 1 0
-                           "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
-                           "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
-                           "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
-                           "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
-                           "movq %%mm0, (%2)        \n\t"
-                           "movq %%mm0, 8(%2)       \n\t"
-                           "movq %%mm1, 16(%2)      \n\t"
-                           "sub  $4, %1             \n\t"
-                           "movq %%mm1, 24(%2)      \n\t"
-                           "sub  $32, %2            \n\t"
-                           "subl $2, %%ecx          \n\t"
-                           "jnz .loop2_pass0        \n\t"
-                           "EMMS                    \n\t" // DONE
-
-                           : "=c" (dummy_value_c),        // output regs (dummy)
-                             "=S" (dummy_value_S),
-                             "=D" (dummy_value_D)
-
-                           : "0" (width_mmx),     // ecx  // input regs
-                             "1" (sptr),          // esi/rsi
-                             "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                           : "%mm0", "%mm1"               // clobber list
-#endif
-                        );
-                     }
-
-                     sptr -= (width_mmx*2 - 2); // sign fixed
-                     dp -= (width_mmx*16 - 2);  // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-               } /* end of pixel_bytes == 2 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP8)
-               {
-// GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
-                  // GRR NOTE:  no need to combine passes here!
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     // source is 8-byte RRGGBBAA
-                     // dest is 16-byte RRGGBBAA RRGGBBAA
-                     __asm__ __volatile__ (
-                        "sub  $8, %2             \n\t" // start of last block
-
-                     ".loop8_pass4:              \n\t"
-                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                        "movq %%mm0, (%2)        \n\t"
-                        "sub  $8, %1             \n\t"
-                        "movq %%mm0, 8(%2)       \n\t"
-                        "sub  $16, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop8_pass4        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp)             // edi/rdi
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0"                       // clobber list
-#endif
-                     );
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     // source is 8-byte RRGGBBAA
-                     // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
-                     // (recall that expansion is _in place_:  sptr and dp
-                     //  both point at locations within same row buffer)
-                     __asm__ __volatile__ (
-                        "sub  $24, %2            \n\t" // start of last block
-
-                     ".loop8_pass2:              \n\t"
-                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                        "movq %%mm0, (%2)        \n\t"
-                        "movq %%mm0, 8(%2)       \n\t"
-                        "movq %%mm0, 16(%2)      \n\t"
-                        "sub  $8, %1             \n\t"
-                        "movq %%mm0, 24(%2)      \n\t"
-                        "sub  $32, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop8_pass2        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0"                       // clobber list
-#endif
-                     );
-                  }
-                  else if (width)  // && ((pass == 0) || (pass == 1))
-                  {
-                     // source is 8-byte RRGGBBAA
-                     // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
-                     __asm__ __volatile__ (
-                        "sub  $56, %2            \n\t" // start of last block
-
-                     ".loop8_pass0:              \n\t"
-                        "movq (%1), %%mm0        \n\t" // 7 6 5 4 3 2 1 0
-                        "movq %%mm0, (%2)        \n\t"
-                        "movq %%mm0, 8(%2)       \n\t"
-                        "movq %%mm0, 16(%2)      \n\t"
-                        "movq %%mm0, 24(%2)      \n\t"
-                        "movq %%mm0, 32(%2)      \n\t"
-                        "movq %%mm0, 40(%2)      \n\t"
-                        "movq %%mm0, 48(%2)      \n\t"
-                        "sub  $8, %1             \n\t"
-                        "movq %%mm0, 56(%2)      \n\t"
-                        "sub  $64, %2            \n\t"
-                        "decl %%ecx              \n\t"
-                        "jnz .loop8_pass0        \n\t"
-                        "EMMS                    \n\t" // DONE
-
-                        : "=c" (dummy_value_c),        // output regs (dummy)
-                          "=S" (dummy_value_S),
-                          "=D" (dummy_value_D)
-
-                        : "0" (width),         // ecx  // input regs
-                          "1" (sptr),          // esi/rsi
-                          "2" (dp)             // edi/rdi
-
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-                        : "%mm0"                       // clobber list
-#endif
-                     );
-                  }
-               } /* end of pixel_bytes == 8 */
-
-               //--------------------------------------------------------------
-               else if (pixel_bytes == BPP6)   // why no MMX for this case?
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP6);
-                        dp -= BPP6;
-                     }
-                     sptr -= BPP6;
-                  }
-               } /* end of pixel_bytes == 6 */
-
-               //--------------------------------------------------------------
-               else
-               {
-                  // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-                  png_debug(1, "Internal libpng logic error (GCC "
-                    "png_do_read_interlace() _mmx_supported)\n");
-#endif
-               }
-
-            } // end of _mmx_supported ========================================
-
-            else /* MMX not supported:  use modified C code - takes advantage
-                  *   of inlining of png_memcpy for a constant */
-            {
-               if (pixel_bytes == BPP3)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP3);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP3);
-                        dp -= BPP3;
-                     }
-                     sptr -= BPP3;
-                  }
-               }
-               else if (pixel_bytes == BPP4)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP4);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-#if defined(PNG_DEBUG) && defined(PNG_1_0_X)  // row_buf_size gone in 1.2.x
-                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
-                        {
-                           printf("dp out of bounds: row=%10p, dp=%10p, "
-                             "rp=%10p\n", row, dp, row+png_ptr->row_buf_size);
-                           printf("row_buf_size=%lu\n", png_ptr->row_buf_size);
-                        }
-#endif
-                        png_memcpy(dp, v, BPP4);
-                        dp -= BPP4;
-                     }
-                     sptr -= BPP4;
-                  }
-               }
-               else if (pixel_bytes == 1)
-               {
-                  for (i = width; i; i--)
-                  {
-                     int j;
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        *dp-- = *sptr;
-                     }
-                     --sptr;
-                  }
-               }
-               else if (pixel_bytes == BPP2)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP2);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP2);
-                        dp -= BPP2;
-                     }
-                     sptr -= BPP2;
-                  }
-               }
-               else if (pixel_bytes == BPP6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP6);
-                        dp -= BPP6;
-                     }
-                     sptr -= BPP6;
-                  }
-               }
-               else if (pixel_bytes == BPP8)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, BPP8);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, BPP8);
-                        dp -= BPP8;
-                     }
-                     sptr -= BPP8;
-                  }
-               }
-               else
-               {
-                  // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-                  png_debug(1, "Internal libpng logic error (GCC "
-                    "png_do_read_interlace() !_mmx_supported)\n");
-#endif
-               }
-
-            } /* end if (MMX not supported) */
-            break;
-         } /* end default (8-bit or larger) */
-      } /* end switch (row_info->pixel_depth) */
-
-      row_info->width = final_width;
-
-      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
-   }
-
-} /* end png_do_read_interlace() */
-
-#endif /* PNG_HAVE_MMX_READ_INTERLACE */
-#endif /* PNG_READ_INTERLACING_SUPPORTED */
-
-
-
-#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
-#if defined(PNG_MMX_READ_FILTER_AVG_SUPPORTED)
-
-//===========================================================================//
-//                                                                           //
-//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Average filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
-                            png_bytep prev_row)
-{
-   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
-   int bpp;
-   int dummy_value_a;
-   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
-   int dummy_value_d;
-   png_bytep dummy_value_S;
-   png_bytep dummy_value_D;
-   int diff; //     __attribute__((used));
-
-   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
-   FullLength = row_info->rowbytes;         // number of bytes to filter
-
-   __asm__ __volatile__ (
-   "avg_top:                       \n\t"
-      SAVE_GOT_ebx
-      SAVE_r15
-      SAVE_ebp
-      // initialize address pointers and offset
-//pre "movl row, %5                \n\t" // edi/rdi:  ptr to Avg(x)
-      "xorl %%ebx, %%ebx           \n\t" // ebx:  x
-//pre "movl prev_row, %4           \n\t" // esi/rsi:  ptr to Prior(x)
-      "mov  %5, " PDX "            \n\t" // copy of row ptr...
-//pre "subl bpp, " PDX "           \n\t" // (bpp is preloaded into ecx)
-      "sub  " PCX "," PDX "        \n\t" // edx/rdx:  ptr to Raw(x-bpp)
-//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
-      SAVE_FullLength                    // ...but store for later use
-      "xorl %%eax, %%eax           \n\t"
-
-      // Compute the Raw value for the first bpp bytes
-      //    Raw(x) = Avg(x) + (Prior(x)/2)
-   "avg_rlp:                       \n\t"
-      "movb (%4," PBX ",), %%al    \n\t" // load al with Prior(x)
-      "incl %%ebx                  \n\t"
-      "shrb %%al                   \n\t" // divide by 2
-      "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
-//pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
-      "jb avg_rlp                  \n\t" // mov does not affect flags
-
-      // get # of bytes to alignment (32-bit mask _would_ be good enough
-      // [computing delta], but 32-bit ops are zero-extended on 64-bit, argh)
-      // (if swapped edx and ebp, could do 8-bit or 16-bit mask...FIXME?)
-      "mov  %5, " PBP "            \n\t" // take start of row
-      "add  " PBX "," PBP "        \n\t" // add bpp
-      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
-      "sub  %5, " PBP "            \n\t" // subtract row ptr again => ebp =
-      "jz avg_go                   \n\t" //  target value of ebx at alignment
-
-      "xorl %%ecx, %%ecx           \n\t"
-
-      // fix alignment
-      // Compute the Raw value for the bytes up to the alignment boundary
-      //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-   "avg_lp1:                       \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      "movb (%4," PBX ",), %%cl    \n\t" // load cl with Prior(x)
-      "movb (" PDX "," PBX ",), %%al \n\t" // load al with Raw(x-bpp)
-      "addw %%cx, %%ax             \n\t"
-      "incl %%ebx                  \n\t"
-      "shrw %%ax                   \n\t" // divide by 2
-      "addb -1(%5," PBX ",), %%al  \n\t" // add Avg(x); -1 to offset inc ebx
-      "cmpl %%ebp, %%ebx           \n\t" // check if at alignment boundary
-      "movb %%al, -1(%5," PBX ",)  \n\t" // write Raw(x); -1 to offset inc ebx
-      "jb avg_lp1                  \n\t" // repeat until at alignment boundary
-
-   "avg_go:                        \n\t"
-      RESTORE_FullLength "%%eax    \n\t" // FullLength -> eax
-      "movl %%eax, %%ecx           \n\t" // copy -> ecx
-      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
-      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
-      "subl %%eax, %%ecx           \n\t" // sub over-bytes from original length
-//out "movl %%ecx, MMXLength       \n\t"
-      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
-      RESTORE_ebp                        //  (could swap ebp and edx functions)
-      RESTORE_r15
-      RESTORE_GOT_ebx
-
-// "There is no way for you to specify that an input operand is modified
-// without also specifying it as an output operand."  [makes sense]
-
-// "Unless an output operand has the `&' constraint modifier, GCC may
-// allocate it in the same register as an unrelated input operand, on the
-// assumption the inputs are consumed before the outputs are produced."
-// [trying to _force_ this]
-
-// "`='   Means that this operand is write-only for this instruction:
-//        the previous value is discarded and replaced by output data."
-//        [operand == variable name, presumably]
-
-      // output regs
-      // these are operands 0-1 (originally 0-3):
-      : "=c" (MMXLength),      // %0 -> %0
-        "=a" (diff)            // %3 -> %1
-//      "=S" (dummy_value_S),  // %1 -> GONE
-//      "=D" (dummy_value_D),  // %2 -> GONE
-
-      // input regs
-      // these are operands 2-5 (originally 4-7); two of their constraints say
-      // they must go in same places as operands 0-1 (originally 0-3) above:
-      : "0" (bpp),         // %4 -> %2 ecx
-        "1" (FullLength),  // %7 -> %3 eax
-        "S" (prev_row),    // %5 -> %4 esi/rsi
-        "D" (row)          // %6 -> %5 edi/rdi
-
-      : "%edx"                           // clobber list
-        _CLOBBER_r15
-        _CLOBBER_ebp
-        _CLOBBER_GOT_ebx
-   );
-
-   // now do the math for the rest of the row
-   switch (bpp)
-   {
-      case 3:
-      {
-//       _ShiftBpp = 24;    // == 3 * 8
-//       _ShiftRem = 40;    // == 64 - 24
-
-         __asm__ __volatile__ (
-            // re-init address pointers and offset
-            LOAD_GOT_rbp
-            "movq " AMASK5_3_0 ", %%mm7    \n\t" // _amask5_3_0 -> mm7
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 //  alignment boundary
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
-// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
-            RESTORE_rbp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq  -8(%1," PCX ",), %%mm2 \n\t"// load previous aligned 8 bytes
-                                               // (correct pos. in loop below)
-         "avg_3lp:                        \n\t"
-            "movq  (%1," PCX ",), %%mm0   \n\t" // load mm0 with Avg(x)
-            "movq  %%mm5, %%mm3           \n\t"
-            "psrlq $40, %%mm2             \n\t" // correct position Raw(x-bpp)
-                                                // data
-            "movq  (%0," PCX ",), %%mm1   \n\t" // load mm1 with Prior(x)
-            "movq  %%mm7, %%mm6           \n\t"
-            "pand  %%mm1, %%mm3           \n\t" // get lsb for each prevrow byte
-            "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
-                                                // each byte
-            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
-            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                                // LBCarrys
-            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                                // where both lsb's were == 1
-                                                // (valid only for active group)
-            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
-                                                // for each byte
-            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 1
-                                                // bytes to add to Avg
-            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
-                                                // Avg for each Active byte
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $24, %%mm6             \n\t" // shift the mm6 mask to cover
-                                                // bytes 3-5
-            "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
-            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                                // LBCarrys
-            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                                // where both lsb's were == 1
-                                                // (valid only for active group)
-            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
-                                                // for each byte
-            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                                // bytes to add to Avg
-            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
-                                                // Avg for each Active byte
-
-            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $24, %%mm6             \n\t" // shift mm6 mask to cover last
-                                                // two bytes
-            "movq  %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $24, %%mm2             \n\t" // shift data to pos. correctly
-                              // Data need be shifted only once here to
-                              // get the correct x-bpp offset.
-            "movq  %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                                // LBCarrys
-            "pand  %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                                // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
-                                                // byte
-            "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to Raw(x-bpp)/2
-                                                // for each byte
-            "pand  %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                                // bytes to add to Avg
-            "addl  $8, %%ecx              \n\t"
-            "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
-                                                // Avg for each Active byte
-            // now ready to write back to memory
-            "movq  %%mm0, -8(%1," PCX ",) \n\t"
-            // move updated Raw(x) to use as Raw(x-bpp) for next loop
-            "cmpl  %%eax, %%ecx           \n\t" // MMXLength
-            "movq  %%mm0, %%mm2           \n\t" // mov updated Raw(x) to mm2
-            "jb avg_3lp                   \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 3 bpp
-
-      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
-      {         // but loop uses all 8 MMX regs, and psrlq/psllq require 64-bit
-                // mem (PIC/.so problems), MMX reg (none left), or immediate
-//       _ShiftBpp = bpp << 3;        // 32 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
-            RESTORE_rbp
-
-            // ... and clear all bytes except for 1st active group
-// preload  "movl  row, %1               \n\t" // edi:  Avg(x)
-            "psrlq $32, %%mm7            \n\t" // was _ShiftRem
-// preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
-            "movq  %%mm7, %%mm6          \n\t"
-            "psllq $32, %%mm6            \n\t" // mask for 2nd active group
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                             // (we correct pos. in loop below)
-         "avg_4lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "psrlq $32, %%mm2            \n\t" // shift data to pos. correctly
-            "movq (%0," PCX ",), %%mm1   \n\t"
-            // add (Prev_row/2) to average
-            "movq %%mm5, %%mm3           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
-                                               // each byte
-            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
-                                               // for each Active
-                              // byte
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $32, %%mm2            \n\t" // shift data to pos. correctly
-            "addl $8, %%ecx              \n\t"
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            // now ready to write back to memory
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            // prep Raw(x-bpp) for next loop
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "jb avg_4lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 4 bpp
-
-      case 1:
-      {
-         __asm__ __volatile__ (
-            // re-init address pointers and offset
-// preload  "movl diff, %%ecx            \n\t" // ecx: x = offset to align. bdry
-// preload  "movl row, %1                \n\t" // edi/rdi:  Avg(x)
-// preload  "movl FullLength, %%eax      \n\t"
-            "cmpl %%eax, %%ecx           \n\t" // test if offset at end of array
-            "jnb avg_1end                \n\t"
-
-            SAVE_ebp
-
-            // do Avg decode for remaining bytes
-// preload  "movl prev_row, %0           \n\t" // esi/rsi:  Prior(x)
-            "mov  %1, " PBP "            \n\t" // copy of row pointer...
-            "dec  " PBP "                \n\t" // ebp/rbp:  Raw(x-bpp)
-            "xorl %%edx, %%edx           \n\t" // zero edx before using dl & dx
-                                               //  in loop below
-            SAVE_GOT_ebx
-
-         "avg_1lp:                       \n\t"
-            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-            "xorl %%ebx, %%ebx           \n\t"
-            "movb (%0," PCX ",), %%dl    \n\t" // load dl with Prior(x)
-            "movb (" PBP "," PCX ",), %%bl \n\t" // load bl with Raw(x-bpp)
-            "addw %%dx, %%bx             \n\t"
-            "incl %%ecx                  \n\t"
-            "shrw %%bx                   \n\t" // divide by 2
-            "addb -1(%1," PCX ",), %%bl  \n\t" // add Avg(x); -1 to offset
-                                               // inc ecx
-            "cmpl %%eax, %%ecx           \n\t" // check if at end of array
-            "movb %%bl, -1(%1," PCX ",)  \n\t" // write back Raw(x);
-                         // mov does not affect flags; -1 to offset inc ecx
-            "jb avg_1lp                  \n\t"
-
-            RESTORE_GOT_ebx
-            RESTORE_ebp
-
-         "avg_1end:                      \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (FullLength)   // eax
-
-            : "%edx"                           // clobber list
-              _CLOBBER_GOT_ebx
-              _CLOBBER_ebp
-         );
-      }
-      return;  // end 1 bpp
-
-      case 2:
-      {
-//       _ShiftBpp = 16;   // == 2 * 8
-//       _ShiftRem = 48;   // == 64 - _ShiftBpp
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            // load (former) _ActiveMask
-            "movq " AMASK6_2_0 ", %%mm7    \n\t" // _amask6_2_0 -> mm7
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
-            RESTORE_rbp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                             // (we correct pos. in loop below)
-         "avg_2lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "psrlq $48, %%mm2            \n\t" // shift data to pos. correctly
-            "movq (%0," PCX ",), %%mm1   \n\t" //  (GRR BUGFIX:  was psllq)
-            // add (Prev_row/2) to average
-            "movq %%mm5, %%mm3           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "movq %%mm7, %%mm6           \n\t"
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
-                                               // each byte
-
-            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                                               // lsb's were == 1 (only valid
-                                               // for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
-                                               // for each Active byte
-
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
-                                               // bytes 2 & 3
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                                               // lsb's were == 1 (only valid
-                                               // for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-
-            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
-                                               // bytes 4 & 5
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both lsb's were == 1
-                                               // (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-
-            // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
-            "psllq $16, %%mm6            \n\t" // shift the mm6 mask to cover
-                                               // bytes 6 & 7
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "addl $8, %%ecx              \n\t"
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                                               // lsb's were == 1 (only valid
-                                               // for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            // now ready to write back to memory
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            // prep Raw(x-bpp) for next loop
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "jb avg_2lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 2 bpp
-
-      case 6:   // formerly shared with 4 bpp case (see comments there)
-      {
-//       _ShiftBpp = bpp << 3;        // 48 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // _LBCarryMask -> mm5
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            "movq " AMASK0_8_0 ", %%mm7    \n\t" // _amask0_8_0 -> mm7
-            RESTORE_rbp
-
-            // ... and clear all bytes except for 1st active group
-// preload  "movl  row, %1               \n\t" // edi:  Avg(x)
-            "psrlq $16, %%mm7            \n\t"
-// preload  "movl  prev_row, %0          \n\t" // esi:  Prior(x)
-            "movq  %%mm7, %%mm6          \n\t"
-            "psllq $48, %%mm6            \n\t" // mask for 2nd active group
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                             // (we correct pos. in loop below)
-         "avg_6lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "psrlq $16, %%mm2            \n\t" // shift data to pos. correctly
-            "movq (%0," PCX ",), %%mm1   \n\t"
-            // add (Prev_row/2) to average
-            "movq %%mm5, %%mm3           \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
-                                               // each byte
-            // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
-                                               // for each Active
-                              // byte
-            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "psllq $48, %%mm2            \n\t" // shift data to pos. correctly
-            "addl $8, %%ecx              \n\t"
-            "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
-                                               // LBCarrys
-            "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
-                                               // where both
-                              // lsb's were == 1 (only valid for active group)
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
-                                               // byte
-            "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
-                                               // for each byte
-            "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
-                                               // bytes to add to Avg
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
-                                               // Avg for each Active byte
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            // now ready to write back to memory
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            // prep Raw(x-bpp) for next loop
-            "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
-            "jb avg_6lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 6 bpp
-
-      case 8:
-      {
-         __asm__ __volatile__ (
-            // re-init address pointers and offset
-// preload  "movl  diff, %%ecx             \n\t" // ecx:  x = offset to
-                                                 // alignment boundary
-            LOAD_GOT_rbp
-            "movq " LB_CARRY_MASK ", %%mm5 \n\t" // [interleave for parallel.?]
-// preload  "movl  row, %1                 \n\t" // edi:  Avg(x)
-            "movq " HB_CLEAR_MASK ", %%mm4 \n\t" // _HBClearMask -> mm4
-// preload  "movl  prev_row, %0            \n\t" // esi:  Prior(x)
-            RESTORE_rbp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm2 \n\t" // load previous aligned 8 bytes
-                                      // (NO NEED to correct pos. in loop below)
-
-         "avg_8lp:                       \n\t"
-            "movq (%1," PCX ",), %%mm0   \n\t"
-            "movq %%mm5, %%mm3           \n\t"
-            "movq (%0," PCX ",), %%mm1   \n\t"
-            "addl $8, %%ecx              \n\t"
-            "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
-            "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
-            "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
-                                               //  where both lsb's were == 1
-            "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
-            "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
-            "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
-            "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
-            "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
-            "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PCX ",) \n\t"
-            "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
-            "jb avg_8lp                  \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),    // esi/rsi    // input regs
-              "1" (row),         // edi/rdi
-              "2" (diff),        // ecx
-              "3" (MMXLength)    // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2"           // clobber list
-            , "%mm3", "%mm4", "%mm5"
-#endif
-         );
-      }
-      break;  // end 8 bpp
-
-      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
-      {
-         // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-         png_debug(1, "Internal libpng logic error (GCC "
-           "png_read_filter_row_mmx_avg())\n");
-#endif
-      }
-      break;
-
-   } // end switch (bpp)
-
-   __asm__ __volatile__ (
-      // MMX acceleration complete; now do clean-up
-      // check if any remaining bytes left to decode
-//pre "movl FullLength, %%edx      \n\t"
-//pre "movl MMXLength, %%eax       \n\t" // eax:  x == offset bytes after MMX
-//pre "movl row, %2                \n\t" // edi:  Avg(x)
-      "cmpl %%edx, %%eax           \n\t" // test if offset at end of array
-      "jnb avg_end                 \n\t"
-
-      SAVE_ebp
-
-      // do Avg decode for remaining bytes
-//pre "movl prev_row, %1           \n\t" // esi:  Prior(x)
-      "mov  %2, " PBP "            \n\t" // copy of row pointer...
-//pre "subl bpp, " PBP "           \n\t" // (bpp is preloaded into ecx)
-      "sub  " PCX "," PBP "        \n\t" // ebp:  Raw(x-bpp)
-      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
-
-      SAVE_GOT_ebx
-
-   "avg_lp2:                       \n\t"
-      // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-      "xorl %%ebx, %%ebx           \n\t"
-      "movb (%1," PAX ",), %%cl    \n\t" // load cl with Prior(x)
-      "movb (" PBP "," PAX ",), %%bl \n\t" // load bl with Raw(x-bpp)
-      "addw %%cx, %%bx             \n\t"
-      "incl %%eax                  \n\t"
-      "shrw %%bx                   \n\t" // divide by 2
-      "addb -1(%2," PAX ",), %%bl  \n\t" // add Avg(x); -1 to offset inc eax
-      "cmpl %%edx, %%eax           \n\t" // check if at end of array
-      "movb %%bl, -1(%2," PAX ",)  \n\t" // write back Raw(x) [mov does not
-      "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc eax]
-
-      RESTORE_GOT_ebx
-      RESTORE_ebp
-
-   "avg_end:                       \n\t"
-      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
-
-      : "=c" (dummy_value_c),            // output regs (dummy)
-        "=S" (dummy_value_S),
-        "=D" (dummy_value_D),
-        "=a" (dummy_value_a),
-        "=d" (dummy_value_d)
-
-      : "0" (bpp),         // ecx        // input regs
-        "1" (prev_row),    // esi/rsi
-        "2" (row),         // edi/rdi
-        "3" (MMXLength),   // eax
-        "4" (FullLength)   // edx
-
-      CLOB_COLON_ebx_ebp                 // clobber list
-        CLOBBER_GOT_ebx
-        CLOB_COMMA_ebx_ebp
-        CLOBBER_ebp
-   );
-
-} /* end png_read_filter_row_mmx_avg() */
-
-#endif /* PNG_MMX_READ_FILTER_AVG_SUPPORTED */
-
-
-
-#if defined(PNG_MMX_READ_FILTER_PAETH_SUPPORTED)
-#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
-
-//===========================================================================//
-//                                                                           //
-//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Paeth filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
-                              png_bytep prev_row)
-{
-   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
-   int bpp;
-   int dummy_value_a;
-   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
-   int dummy_value_d;
-   png_charp dummy_value_S;
-   png_charp dummy_value_D;
-   int diff; //     __attribute__((used));
-
-   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
-   FullLength = row_info->rowbytes;         // number of bytes to filter
-
-   __asm__ __volatile__ (
-      SAVE_GOT_ebx
-      SAVE_r15
-      SAVE_ebp
-//pre "movl row, %2                \n\t" // edi/rdi
-      "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
-//pre "movl prev_row, %1           \n\t" // esi/rsi
-      "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
-//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
-      SAVE_FullLength                    // ...but store for later use
-      "xorl %%eax, %%eax           \n\t"
-
-      // Compute the Raw value for the first bpp bytes
-      // Note: the formula works out to be always
-      //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
-   "paeth_rlp:                     \n\t"
-      "movb (%2," PBX ",), %%al    \n\t"
-      "addb (%1," PBX ",), %%al    \n\t"
-      "incl %%ebx                  \n\t"
-//pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%2," PBX ",)  \n\t"
-      "jb paeth_rlp                \n\t"
-
-      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
-      // so hereafter %%ebp is sufficient even on 64-bit)
-      "mov  %2, " PBP "            \n\t" // take start of row
-      "add  " PBX "," PBP "        \n\t" // add bpp
-      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
-      "sub  %2, " PBP "            \n\t" // subtract row ptr again => ebp =
-      "jz paeth_go                 \n\t" //  target value of ebx at alignment
-
-      "xorl %%ecx, %%ecx           \n\t"
-
-      SAVE_r11_r12_r13
-
-      // fix alignment
-   "paeth_lp1:                     \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      // pav = p - a = (a + b - c) - a = b - c
-      "movb (%1," PBX ",), %%al    \n\t" // load Prior(x) into al
-      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-      "movl %%eax, " pa_TEMP "     \n\t" // Save pav for later use
-      "xorl %%eax, %%eax           \n\t"
-      // pbv = p - b = (a + b - c) - b = a - c
-      "movb (%2," PDX ",), %%al    \n\t" // load Raw(x-bpp) into al
-      "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
-      "movl %%eax, %%ecx           \n\t"
-      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
-      "addl " pa_TEMP ", %%eax     \n\t" // pcv = pav + pbv
-      // pc = abs(pcv)
-      "testl $0x80000000, %%eax    \n\t"
-      "jz paeth_pca                \n\t"
-      "negl %%eax                  \n\t" // reverse sign of neg values
-
-   "paeth_pca:                     \n\t"
-      "movl %%eax, " pc_TEMP "     \n\t" // save pc for later use
-      // pb = abs(pbv)
-      "testl $0x80000000, %%ecx    \n\t"
-      "jz paeth_pba                \n\t"
-      "negl %%ecx                  \n\t" // reverse sign of neg values
-
-   "paeth_pba:                     \n\t"
-      "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
-      // pa = abs(pav)
-      "movl " pa_TEMP ", %%eax     \n\t"
-      "testl $0x80000000, %%eax    \n\t"
-      "jz paeth_paa                \n\t"
-      "negl %%eax                  \n\t" // reverse sign of neg values
-
-   "paeth_paa:                     \n\t"
-      "movl %%eax, " pa_TEMP "     \n\t" // save pa for later use
-      // test if pa <= pb
-      "cmpl %%ecx, %%eax           \n\t"
-      "jna paeth_abb               \n\t"
-      // pa > pb; now test if pb <= pc
-      "cmpl " pc_TEMP ", %%ecx     \n\t"
-      "jna paeth_bbc               \n\t"
-      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth             \n\t"
-
-   "paeth_bbc:                     \n\t"
-      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-      "movb (%1," PBX ",), %%cl    \n\t" // load Prior(x) into cl
-      "jmp paeth_paeth             \n\t"
-
-   "paeth_abb:                     \n\t"
-      // pa <= pb; now test if pa <= pc
-      "cmpl " pc_TEMP ", %%eax     \n\t"
-      "jna paeth_abc               \n\t"
-      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PDX ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth             \n\t"
-
-   "paeth_abc:                     \n\t"
-      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-      "movb (%2," PDX ",), %%cl    \n\t" // load Raw(x-bpp) into cl
-
-   "paeth_paeth:                   \n\t"
-      "incl %%ebx                  \n\t"
-      "incl %%edx                  \n\t"
-      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-      "addb %%cl, -1(%2," PBX ",)  \n\t"
-      "cmpl %%ebp, %%ebx           \n\t"
-      "jb paeth_lp1                \n\t"
-
-      RESTORE_r11_r12_r13
-
-   "paeth_go:                      \n\t"
-      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
-      "movl %%ecx, %%eax           \n\t"
-      "subl %%ebx, %%eax           \n\t" // subtract alignment fix
-      "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
-      "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
-//out "movl %%ecx, MMXLength       \n\t"
-      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
-      RESTORE_ebp                        //  (could swap ebp and edx functions)
-      RESTORE_r15
-      RESTORE_GOT_ebx
-
-      : "=c" (MMXLength),                // output regs
-        "=S" (dummy_value_S),
-        "=D" (dummy_value_D),
-        "=a" (diff)
-
-      : "0" (bpp),         // ecx        // input regs
-        "1" (prev_row),    // esi/rsi
-        "2" (row),         // edi/rdi
-        "3" (FullLength)   // eax
-
-      : "%edx"                           // clobber list
-        _CLOBBER_r11_r12_r13
-        _CLOBBER_r15
-        _CLOBBER_ebp
-        _CLOBBER_GOT_ebx
-   );
-
-   // now do the math for the rest of the row
-   switch (bpp)
-   {
-      case 3:
-      {
-//       _ShiftBpp = 24;    // == bpp * 8
-//       _ShiftRem = 40;    // == 64 - _ShiftBpp
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            "pxor %%mm0, %%mm0           \n\t"
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t"
-         "paeth_3lp:                     \n\t"
-            "psrlq $40, %%mm1            \n\t" // shift last 3 bytes to 1st
-                                               // 3 bytes
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "psrlq $40, %%mm3            \n\t" // shift last 3 bytes to 1st
-                                               // 3 bytes
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
-            "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
-            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
-                                               // Raw(x-bpp)
-            // now do Paeth for 2nd set of bytes (3-5)
-            "psrlq $24, %%mm2            \n\t" // load b=Prior(x) step 2
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            "pxor %%mm7, %%mm7           \n\t"
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
-            //       pav + pbv = pbv + pav
-            "movq %%mm5, %%mm6           \n\t"
-            "paddw %%mm4, %%mm6          \n\t"
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
-            "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
-            "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
-            "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm0, %%mm5          \n\t"
-            "psubw %%mm7, %%mm4          \n\t"
-            "psubw %%mm0, %%mm5          \n\t"
-            "psubw %%mm7, %%mm4          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
-            "pand " AMASK5_3_0 ", %%mm7  \n\t" // _amask5_3_0 (was _ActiveMask)
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "psllq $24, %%mm7            \n\t" // shift bytes to 2nd group of
-                                               // 3 bytes
-             // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "psllq $24, %%mm3            \n\t" // load c=Prior(x-bpp) step 2
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "psllq $24, %%mm1            \n\t" // shift bytes (was _ShiftBpp)
-                                    // now mm1 will be used as Raw(x-bpp)
-            // now do Paeth for 3rd, and final, set of bytes (6-7)
-            "pxor %%mm7, %%mm7           \n\t"
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            "psubw %%mm3, %%mm4          \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "paddw %%mm5, %%mm6          \n\t"
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "pand " AMASK0_2_6 ", %%mm1  \n\t" // _amask0_2_6 (_ActiveMaskEnd)
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-                           // mm3 ready to be used as Prior(x-bpp) next loop
-            "jb paeth_3lp                \n\t"
-            RESTORE_rbp
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 3 bpp
-
-      case 4:
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            "pxor %%mm0, %%mm0           \n\t"
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
-                                               //  a=Raw(x-bpp) bytes
-         "paeth_4lp:                     \n\t"
-            // do first set of 4 bytes
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq (%0," PCX ",), %%mm3   \n\t" // load c=Prior(x-bpp)
-            LOAD_GOT_rbp
-            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
-            RESTORE_rbp
-            "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
-                                               // Raw(x-bpp)
-            // do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add predictor with Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-            "jb paeth_4lp                \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 4 bpp
-
-      case 1:
-      case 2:
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%eax            \n\t" // eax: x = offset to align. bdry
-// preload  "movl FullLength, %%edx      \n\t"
-            "cmpl %%edx, %%eax           \n\t"
-            "jnb paeth_dend              \n\t"
-
-            SAVE_ebp
-
-// preload  "movl row, %2                \n\t" // edi/rdi
-            // do Paeth decode for remaining bytes
-// preload  "movl prev_row, %1           \n\t" // esi/rsi
-            "movl %%eax, %%ebp           \n\t"
-// preload  "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
-            "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
-            "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
-
-            SAVE_GOT_ebx
-            SAVE_r11_r12_r13
-
-         "paeth_dlp:                     \n\t"
-            "xorl %%ebx, %%ebx           \n\t"
-            // pav = p - a = (a + b - c) - a = b - c
-            "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
-            "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-            "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-            "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
-            "xorl %%ebx, %%ebx           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
-            "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-            "movl %%ebx, %%ecx           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
-            // pc = abs(pcv)
-            "testl $0x80000000, %%ebx    \n\t"
-            "jz paeth_dpca               \n\t"
-            "negl %%ebx                  \n\t" // reverse sign of neg values
-
-         "paeth_dpca:                    \n\t"
-            "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
-            // pb = abs(pbv)
-            "testl $0x80000000, %%ecx    \n\t"
-            "jz paeth_dpba               \n\t"
-            "negl %%ecx                  \n\t" // reverse sign of neg values
-
-         "paeth_dpba:                    \n\t"
-            "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
-            // pa = abs(pav)
-            "movl " pa_TEMP ", %%ebx     \n\t"
-            "testl $0x80000000, %%ebx    \n\t"
-            "jz paeth_dpaa               \n\t"
-            "negl %%ebx                  \n\t" // reverse sign of neg values
-
-         "paeth_dpaa:                    \n\t"
-            "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
-            // test if pa <= pb
-            "cmpl %%ecx, %%ebx           \n\t"
-            "jna paeth_dabb              \n\t"
-            // pa > pb; now test if pb <= pc
-            "cmpl " pc_TEMP ", %%ecx     \n\t"
-            "jna paeth_dbbc              \n\t"
-            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-            "jmp paeth_dpaeth            \n\t"
-
-         "paeth_dbbc:                    \n\t"
-            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-            "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
-            "jmp paeth_dpaeth            \n\t"
-
-         "paeth_dabb:                    \n\t"
-            // pa <= pb; now test if pa <= pc
-            "cmpl " pc_TEMP ", %%ebx     \n\t"
-            "jna paeth_dabc              \n\t"
-            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            "movb (%1," PBP ",), %%cl   \n\t" // load Prior(x-bpp) into cl
-            "jmp paeth_dpaeth            \n\t"
-
-         "paeth_dabc:                    \n\t"
-            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-            "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
-
-         "paeth_dpaeth:                  \n\t"
-            "incl %%eax                  \n\t"
-            "incl %%ebp                  \n\t"
-            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-            "addb %%cl, -1(%2," PAX ",)  \n\t"
-            "cmpl %%edx, %%eax           \n\t" // check against FullLength
-            "jb paeth_dlp                \n\t"
-
-            RESTORE_r11_r12_r13
-            RESTORE_GOT_ebx
-            RESTORE_ebp
-
-         "paeth_dend:                    \n\t"
-
-            : "=c" (dummy_value_c),            // output regs (dummy)
-              "=S" (dummy_value_S),
-              "=D" (dummy_value_D),
-              "=a" (dummy_value_a),
-              "=d" (dummy_value_d)
-
-            : "0" (bpp),         // ecx        // input regs
-              "1" (prev_row),    // esi/rsi
-              "2" (row),         // edi/rdi
-              "3" (diff),        // eax
-              "4" (FullLength)   // edx
-
-            CLOB_COLON_ebx_ebp_r1X             // clobber list
-              CLOBBER_GOT_ebx
-              CLOB_COMMA_ebx_ebp
-              CLOBBER_ebp
-              CLOB_COMMA_ebX_r1X
-              CLOBBER_r11_r12_r13
-         );
-      }
-      return; // end 1 or 2 bpp (no need to go further with this one)
-
-      case 6:
-      {
-//       _ActiveMask2 = 0xffffffff00000000LL;  // NOT USED ("_amask_0_4_4")
-//       _ShiftBpp = 48;       // bpp << 3 == bpp * 8
-//       _ShiftRem = 16;       // 64 - _ShiftBpp
-
-         __asm__ __volatile__ (
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-
-         "paeth_6lp:                     \n\t"
-            // must shift to position Raw(x-bpp) data
-            "psrlq $16, %%mm1            \n\t" // was _ShiftRem
-            // do first set of 4 bytes
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
-            // must shift to position Prior(x-bpp) data
-            "psrlq $16, %%mm3            \n\t" // was _ShiftRem
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // load c=Prior(x-bpp)
-            LOAD_GOT_rbp
-            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
-            RESTORE_rbp
-            "psrlq $16, %%mm3            \n\t"
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x) step 1
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "movq %%mm2, %%mm6           \n\t"
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq -8(%1," PCX ",), %%mm1 \n\t"
-            "psllq $48, %%mm6            \n\t" // bpp * 8 = bits per pixel
-            "movq %%mm7, %%mm5           \n\t"
-            "psrlq $16, %%mm1            \n\t" // 64 - (bpp * 8) = remainder
-            "por %%mm6, %%mm3            \n\t"
-            "psllq $48, %%mm5            \n\t" // was _ShiftBpp
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "por %%mm5, %%mm1            \n\t"
-            // do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-            "jb paeth_6lp                \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 6 bpp
-
-      case 8:                          // bpp == 8
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%ecx            \n\t"
-// preload  "movl row, %1                \n\t" // edi/rdi
-// preload  "movl prev_row, %0           \n\t" // esi/rsi
-            "pxor %%mm0, %%mm0           \n\t"
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PCX ",), %%mm1 \n\t" // only time should need to read
-                                               //  a=Raw(x-bpp) bytes
-         "paeth_8lp:                     \n\t"
-            // do first set of 4 bytes
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "packuswb %%mm1, %%mm7       \n\t"
-            "movq -8(%0," PCX ",), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
-            LOAD_GOT_rbp
-            "pand " AMASK4_4_0 ", %%mm7  \n\t" // _amask4_4_0 (was _ActiveMask)
-            RESTORE_rbp
-            "movq (%0," PCX ",), %%mm2   \n\t" // load b=Prior(x)
-            "paddb (%1," PCX ",), %%mm7  \n\t" // add Paeth predictor + Raw(x)
-            "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
-            "movq %%mm7, (%1," PCX ",)   \n\t" // write back updated value
-            "movq -8(%1," PCX ",), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
-
-            // do second set of 4 bytes
-            "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
-            "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            "movq %%mm2, %%mm4           \n\t"
-            // pbv = p - b = (a + b - c) - b = a - c
-            "movq %%mm1, %%mm5           \n\t"
-            "psubw %%mm3, %%mm4          \n\t"
-            "pxor %%mm7, %%mm7           \n\t"
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            "movq %%mm4, %%mm6           \n\t"
-            "psubw %%mm3, %%mm5          \n\t"
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
-            "paddw %%mm5, %%mm6          \n\t"
-            "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
-            "psubw %%mm0, %%mm4          \n\t"
-            "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
-            "psubw %%mm0, %%mm4          \n\t"
-            "psubw %%mm7, %%mm5          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
-            "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
-            "psubw %%mm7, %%mm5          \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            //  test pa <= pb
-            "movq %%mm4, %%mm7           \n\t"
-            "psubw %%mm0, %%mm6          \n\t"
-            "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
-            "movq %%mm7, %%mm0           \n\t"
-            // use mm7 mask to merge pa & pb
-            "pand %%mm7, %%mm5           \n\t"
-            // use mm0 mask copy to merge a & b
-            "pand %%mm0, %%mm2           \n\t"
-            "pandn %%mm4, %%mm7          \n\t"
-            "pandn %%mm1, %%mm0          \n\t"
-            "paddw %%mm5, %%mm7          \n\t"
-            "paddw %%mm2, %%mm0          \n\t"
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
-            "pxor %%mm1, %%mm1           \n\t"
-            "pand %%mm7, %%mm3           \n\t"
-            "pandn %%mm0, %%mm7          \n\t"
-            "pxor %%mm1, %%mm1           \n\t"
-            "paddw %%mm3, %%mm7          \n\t"
-            "pxor %%mm0, %%mm0           \n\t"
-            // step ecx to next set of 8 bytes and repeat loop til done
-            "addl $8, %%ecx              \n\t"
-            "packuswb %%mm7, %%mm1       \n\t"
-            "paddb -8(%1," PCX ",), %%mm1 \n\t" // add Paeth predictor + Raw(x)
-            "cmpl %%eax, %%ecx           \n\t" // MMXLength
-            "movq %%mm1, -8(%1," PCX ",) \n\t" // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-            "jb paeth_8lp                \n\t"
-
-            : "=S" (dummy_value_S),            // output regs (dummy)
-              "=D" (dummy_value_D),
-              "=c" (dummy_value_c),
-              "=a" (dummy_value_a)
-
-            : "0" (prev_row),  // esi/rsi      // input regs
-              "1" (row),       // edi/rdi
-              "2" (diff),      // ecx
-              "3" (MMXLength)  // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm2", "%mm3"   // clobber list
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 8 bpp
-
-      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
-      {
-         // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-         png_debug(1, "Internal libpng logic error (GCC "
-           "png_read_filter_row_mmx_paeth())\n");
-#endif
-      }
-      break;
-
-   } // end switch (bpp)
-
-   __asm__ __volatile__ (
-      // MMX acceleration complete; now do clean-up
-      // check if any remaining bytes left to decode
-//pre "movl FullLength, %%edx      \n\t"
-//pre "movl MMXLength, %%eax       \n\t"
-      "cmpl %%edx, %%eax           \n\t"
-      "jnb paeth_end               \n\t"
-
-      SAVE_ebp
-
-//pre "movl row, %2                \n\t" // edi/rdi
-//pre "movl prev_row, %1           \n\t" // esi/rsi
-      // do Paeth decode for remaining bytes
-      "movl %%eax, %%ebp           \n\t"
-//pre "subl bpp, %%ebp             \n\t" // (bpp is preloaded into ecx)
-      "subl %%ecx, %%ebp           \n\t" // ebp = eax - bpp
-      "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
-
-      SAVE_GOT_ebx
-      SAVE_r11_r12_r13
-
-   "paeth_lp2:                     \n\t"
-      "xorl %%ebx, %%ebx           \n\t"
-      // pav = p - a = (a + b - c) - a = b - c
-      "movb (%1," PAX ",), %%bl    \n\t" // load Prior(x) into bl
-      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-      "movl %%ebx, " pa_TEMP "     \n\t" // Save pav for later use
-      "xorl %%ebx, %%ebx           \n\t"
-      // pbv = p - b = (a + b - c) - b = a - c
-      "movb (%2," PBP ",), %%bl    \n\t" // load Raw(x-bpp) into bl
-      "subl %%ecx, %%ebx           \n\t" // subtract Prior(x-bpp)
-      "movl %%ebx, %%ecx           \n\t"
-      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
-      "addl " pa_TEMP ", %%ebx     \n\t" // pcv = pav + pbv
-      // pc = abs(pcv)
-      "testl $0x80000000, %%ebx    \n\t"
-      "jz paeth_pca2               \n\t"
-      "negl %%ebx                  \n\t" // reverse sign of neg values
-
-   "paeth_pca2:                    \n\t"
-      "movl %%ebx, " pc_TEMP "     \n\t" // save pc for later use
-      // pb = abs(pbv)
-      "testl $0x80000000, %%ecx    \n\t"
-      "jz paeth_pba2               \n\t"
-      "negl %%ecx                  \n\t" // reverse sign of neg values
-
-   "paeth_pba2:                    \n\t"
-      "movl %%ecx, " pb_TEMP "     \n\t" // save pb for later use
-      // pa = abs(pav)
-      "movl " pa_TEMP ", %%ebx     \n\t"
-      "testl $0x80000000, %%ebx    \n\t"
-      "jz paeth_paa2               \n\t"
-      "negl %%ebx                  \n\t" // reverse sign of neg values
-
-   "paeth_paa2:                    \n\t"
-      "movl %%ebx, " pa_TEMP "     \n\t" // save pa for later use
-      // test if pa <= pb
-      "cmpl %%ecx, %%ebx           \n\t"
-      "jna paeth_abb2              \n\t"
-      // pa > pb; now test if pb <= pc
-      "cmpl " pc_TEMP ", %%ecx     \n\t"
-      "jna paeth_bbc2              \n\t"
-      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth2            \n\t"
-
-   "paeth_bbc2:                    \n\t"
-      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-      "movb (%1," PAX ",), %%cl    \n\t" // load Prior(x) into cl
-      "jmp paeth_paeth2            \n\t"
-
-   "paeth_abb2:                    \n\t"
-      // pa <= pb; now test if pa <= pc
-      "cmpl " pc_TEMP ", %%ebx     \n\t"
-      "jna paeth_abc2              \n\t"
-      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-      "movb (%1," PBP ",), %%cl    \n\t" // load Prior(x-bpp) into cl
-      "jmp paeth_paeth2            \n\t"
-
-   "paeth_abc2:                    \n\t"
-      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-      "movb (%2," PBP ",), %%cl    \n\t" // load Raw(x-bpp) into cl
-
-   "paeth_paeth2:                  \n\t"
-      "incl %%eax                  \n\t"
-      "incl %%ebp                  \n\t"
-      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-      "addb %%cl, -1(%2," PAX ",)  \n\t"
-      "cmpl %%edx, %%eax           \n\t" // check against FullLength
-      "jb paeth_lp2                \n\t"
-
-      RESTORE_r11_r12_r13
-      RESTORE_GOT_ebx
-      RESTORE_ebp
-
-   "paeth_end:                     \n\t"
-      "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
-
-      : "=c" (dummy_value_c),            // output regs (dummy)
-        "=S" (dummy_value_S),
-        "=D" (dummy_value_D),
-        "=a" (dummy_value_a),
-        "=d" (dummy_value_d)
-
-      : "0" (bpp),         // ecx        // input regs
-        "1" (prev_row),    // esi/rsi
-        "2" (row),         // edi/rdi
-        "3" (MMXLength),   // eax
-        "4" (FullLength)   // edx
-
-      CLOB_COLON_ebx_ebp_r1X             // clobber list
-        CLOBBER_GOT_ebx
-        CLOB_COMMA_ebx_ebp
-        CLOBBER_ebp
-        CLOB_COMMA_ebX_r1X
-        CLOBBER_r11_r12_r13
-   );
-
-} /* end png_read_filter_row_mmx_paeth() */
-
-#endif // PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK
-#endif /* PNG_MMX_READ_FILTER_PAETH_SUPPORTED */
-
-
-
-
-#if defined(PNG_MMX_READ_FILTER_SUB_SUPPORTED)
-
-//===========================================================================//
-//                                                                           //
-//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Sub filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
-{
-   unsigned FullLength, MMXLength;  // png_uint_32 is actually 64-bit on x86-64
-   int bpp;
-   int dummy_value_a;
-   int dummy_value_c;
-   int dummy_value_d;
-   png_bytep dummy_value_D;
-   int diff; //     __attribute__((used));
-
-   bpp = (row_info->pixel_depth + 7) >> 3;  // calc number of bytes per pixel
-   FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
-     // (why do we subtract off bpp?  not so in avg or paeth...)
-
-   __asm__ __volatile__ (
-      SAVE_r15
-      SAVE_ebp
-//pre "movl row, %1                \n\t" // edi/rdi
-      "mov  %1, " PSI "            \n\t" // lp = row
-//pre "movl bpp, %%ecx             \n\t"
-      "add  " PCX ", %1            \n\t" // rp = row + bpp
-//pre "movl FullLength, %%eax      \n\t" // bring in via eax...
-      SAVE_FullLength                    // ...but store for later use
-
-      "xorl %%eax, %%eax           \n\t"
-
-      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
-      // so hereafter %%ebp is sufficient even on 64-bit)
-      "mov  %1, " PBP "            \n\t" // take start of row
-      "add  $0xf, " PBP "          \n\t" // add 7+8 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ebp     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PBP    "\n\t" // mask to alignment boundary
-      "sub  %1, " PBP "            \n\t" // subtract row ptr again => ebp =
-      "jz sub_go                   \n\t" //  target value of eax at alignment
-
-   "sub_lp1:                       \n\t" // fix alignment
-      "movb (" PSI "," PAX ",), %%cl \n\t"
-      "addb %%cl, (%1," PAX ",)    \n\t"
-      "incl %%eax                  \n\t"
-      "cmpl %%ebp, %%eax           \n\t"
-      "jb sub_lp1                  \n\t"
-
-   "sub_go:                        \n\t"
-      RESTORE_FullLength "%%ecx    \n\t" // FullLength -> ecx
-      "movl %%ecx, %%edx           \n\t"
-      "subl %%eax, %%edx           \n\t" // subtract alignment fix
-      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
-      "subl %%edx, %%ecx           \n\t" // drop over bytes from length
-//out "movl %%ecx, MMXLength       \n\t"
-      "movl %%ebp, %%eax           \n\t" // ebp = diff, but no reg constraint(?)
-      RESTORE_ebp                        //  (could swap ebp and ecx functions,
-      RESTORE_r15                        //  but %%cl issues...)
-
-      : "=c" (MMXLength),       // 0     // output regs
-        "=D" (dummy_value_D),   // 1
-        "=a" (diff)             // 2
-
-      : "0" (bpp),              // ecx   // input regs
-        "1" (row),              // edi
-        "2" (FullLength)        // eax
-
-      : "%esi", "%edx"                   // clobber list
-        _CLOBBER_r15
-        _CLOBBER_ebp
-   );
-
-   // now do the math for the rest of the row
-   switch (bpp)
-   {
-      case 3:
-      {
-//       _ShiftBpp = 24;       // == 3 * 8
-//       _ShiftRem  = 40;      // == 64 - 24
-
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-            LOAD_GOT_rbp
-            // load (former) _ActiveMask for 2nd active byte group
-            "movq " AMASK2_3_3 ", %%mm7   \n\t" // _amask2_3_3
-            RESTORE_rbp
-
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-            "movq %%mm7, %%mm6            \n\t"
-// preload  "movl diff, %%edx             \n\t"
-            "psllq $24, %%mm6             \n\t" // move mask in mm6 to cover
-                                                //  3rd active byte group
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_3lp:                        \n\t" // shift data for adding first
-            "psrlq $40, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            // add 1st active group
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 3rd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $24, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_3lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm6", "%mm7"    // clobber list
-#endif
-         );
-      }
-      break;  // end 3 bpp
-
-      case 4:   // formerly shared with 6 bpp case via _ShiftBpp and _ShiftRem,
-      {         // but 64-bit PIC/.so problems (could still share, moving vars
-                // into unused MMX regs via ecx/edx, but kludgy)
-//       _ShiftBpp = bpp << 3;        // 32 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 32 (psrlq)
-
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-// preload  "movl diff, %%edx             \n\t"
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_4lp:                        \n\t" // shift data for adding first
-            "psrlq $32, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $32, %%mm1             \n\t" // shift data to pos. correctly
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_4lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1"                    // clobber list
-#endif
-         );
-      }
-      break;  // end 4 bpp
-
-      case 1:
-      {
-         __asm__ __volatile__ (
-// preload  "movl diff, %%edx              \n\t"
-// preload  "mov  row, %1                  \n\t" // edi/rdi
-// preload  "cmpl FullLength, %%edx        \n\t"
-            "cmpl %%eax, %%edx             \n\t"
-            "jnb sub_1end                  \n\t"
-            "mov  %1, " PSI "              \n\t" // lp = row
-// irrel.   "xorl %%ecx, %%ecx             \n\t" // (actually bug with preload)
-// preload  "movl bpp, %%ecx               \n\t"
-            "add  " PCX ", %1              \n\t" // rp = row + bpp
-
-         "sub_1lp:                         \n\t"
-            "movb (" PSI "," PDX ",), %%cl \n\t"
-            "addb %%cl, (%1," PDX ",)      \n\t"
-            "incl %%edx                    \n\t"
-            "cmpl %%eax, %%edx             \n\t" // compare with FullLength
-            "jb sub_1lp                    \n\t"
-
-         "sub_1end:                        \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (FullLength)        // eax
-
-            : "%esi"                            // clobber list
-         );
-      }
-      return;  // end 1 bpp (bypassing cleanup block!)
-
-      case 2:
-      {
-//       _ShiftBpp = 16;       // == 2 * 8
-//       _ShiftRem = 48;       // == 64 - 16
-
-         __asm__ __volatile__ (
-            LOAD_GOT_rbp
-            // load (former) _ActiveMask for 2nd active byte group
-            "movq " AMASK4_2_2 ", %%mm7   \n\t" // _amask4_2_2
-            RESTORE_rbp
-// preload  "movl diff, %%edx             \n\t"
-            "movq %%mm7, %%mm6            \n\t"
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-            "psllq $16, %%mm6             \n\t" // move mask in mm6 to cover
-                                                //  3rd active byte group
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-            "movq %%mm6, %%mm5            \n\t"
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-            "psllq $16, %%mm5             \n\t" // move mask in mm5 to cover
-                                                //  4th active byte group
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_2lp:                        \n\t" // shift data for adding first
-            "psrlq $48, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            // add 1st active group
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 3rd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 4th active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $16, %%mm1             \n\t" // shift data to pos. correctly
-            "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_2lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1", "%mm5", "%mm6"    // clobber list
-            , "%mm7"
-#endif
-         );
-      }
-      break;  // end 2 bpp
-
-      case 6:   // formerly shared with 4 bpp case (see comments there)
-      {
-//       _ShiftBpp = bpp << 3;        // 48 (psllq)
-//       _ShiftRem = 64 - _ShiftBpp;  // 16 (psrlq)
-
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-// preload  "movl diff, %%edx             \n\t"
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm1  \n\t"
-
-         "sub_6lp:                        \n\t" // shift data for adding first
-            "psrlq $16, %%mm1             \n\t" //  bpp bytes (no need for mask;
-                                                //  shift clears inactive bytes)
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            // add 2nd active group
-            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
-            "psllq $48, %%mm1             \n\t" // shift data to pos. correctly
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm1, %%mm0           \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // write updated Raws to array
-            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
-            "jb sub_6lp                   \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            : "%mm0", "%mm1"                    // clobber list
-#endif
-         );
-      }
-      break;  // end 6 bpp
-
-      case 8:
-      {
-         __asm__ __volatile__ (
-// preload  "mov  row, %1                 \n\t" // edi/rdi
-// preload  "movl diff, %%edx             \n\t"
-// notused  "mov  %1, " PSI "             \n\t" // lp = row
-// preload  "movl bpp, %%ecx              \n\t"
-            "add  " PCX ", %1             \n\t" // rp = row + bpp
-// preload  "movl MMXLength, %%eax        \n\t"
-
-            // prime the pump:  load the first Raw(x-bpp) data set
-            "movq -8(%1," PDX ",), %%mm7  \n\t"
-            "movl %%eax, %%esi            \n\t" // copy of MMXLength -> esi
-            "andl $0x0000003f, %%esi      \n\t" // calc bytes over mult of 64
-
-         "sub_8lp:                        \n\t"
-            "movq (%1," PDX ",), %%mm0    \n\t" // load Sub(x) for 1st 8 bytes
-            "paddb %%mm7, %%mm0           \n\t"
-            "movq 8(%1," PDX ",), %%mm1   \n\t" // load Sub(x) for 2nd 8 bytes
-            "movq %%mm0, (%1," PDX ",)    \n\t" // write Raw(x) for 1st 8 bytes
-
-            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
-            // This will be repeated for each group of 8 bytes with the 8th
-            // group being used as the Raw(x-bpp) for the 1st group of the
-            // next loop.
-
-            "paddb %%mm0, %%mm1           \n\t"
-            "movq 16(%1," PDX ",), %%mm2  \n\t" // load Sub(x) for 3rd 8 bytes
-            "movq %%mm1, 8(%1," PDX ",)   \n\t" // write Raw(x) for 2nd 8 bytes
-            "paddb %%mm1, %%mm2           \n\t"
-            "movq 24(%1," PDX ",), %%mm3  \n\t" // load Sub(x) for 4th 8 bytes
-            "movq %%mm2, 16(%1," PDX ",)  \n\t" // write Raw(x) for 3rd 8 bytes
-            "paddb %%mm2, %%mm3           \n\t"
-            "movq 32(%1," PDX ",), %%mm4  \n\t" // load Sub(x) for 5th 8 bytes
-            "movq %%mm3, 24(%1," PDX ",)  \n\t" // write Raw(x) for 4th 8 bytes
-            "paddb %%mm3, %%mm4           \n\t"
-            "movq 40(%1," PDX ",), %%mm5  \n\t" // load Sub(x) for 6th 8 bytes
-            "movq %%mm4, 32(%1," PDX ",)  \n\t" // write Raw(x) for 5th 8 bytes
-            "paddb %%mm4, %%mm5           \n\t"
-            "movq 48(%1," PDX ",), %%mm6  \n\t" // load Sub(x) for 7th 8 bytes
-            "movq %%mm5, 40(%1," PDX ",)  \n\t" // write Raw(x) for 6th 8 bytes
-            "paddb %%mm5, %%mm6           \n\t"
-            "movq 56(%1," PDX ",), %%mm7  \n\t" // load Sub(x) for 8th 8 bytes
-            "movq %%mm6, 48(%1," PDX ",)  \n\t" // write Raw(x) for 7th 8 bytes
-            "addl $64, %%edx              \n\t"
-            "paddb %%mm6, %%mm7           \n\t"
-            "cmpl %%esi, %%edx            \n\t" // cmp to bytes over mult of 64
-            "movq %%mm7, -8(%1," PDX ",)  \n\t" // write Raw(x) for 8th 8 bytes
-            "jb sub_8lp                   \n\t"
-
-            "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
-            "jnb sub_8lt8                 \n\t"
-
-         "sub_8lpA:                       \n\t"
-            "movq (%1," PDX ",), %%mm0    \n\t"
-            "addl $8, %%edx               \n\t"
-            "paddb %%mm7, %%mm0           \n\t"
-            "cmpl %%eax, %%edx            \n\t" // compare to MMXLength
-            "movq %%mm0, -8(%1," PDX ",)  \n\t" // -8 to offset early addl edx
-            "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
-            "jb sub_8lpA                  \n\t" //  to mm7 to be new Raw(x-bpp)
-                                                //  for next loop
-         "sub_8lt8:                       \n\t"
-
-            : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-              "=D" (dummy_value_D),   // 1
-              "=d" (dummy_value_d),   // 2
-              "=a" (dummy_value_a)    // 3
-
-            : "0" (bpp),              // ecx    // input regs
-              "1" (row),              // edi
-              "2" (diff),             // edx
-              "3" (MMXLength)         // eax
-
-            : "%esi"                            // clobber list
-#if defined(CLOBBER_MMX_REGS_SUPPORTED)
-            , "%mm0", "%mm1", "%mm2", "%mm3"
-            , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-         );
-      }
-      break;  // end 8 bpp
-
-      default:                // bpp != 1,2,3,4,6,8:  doesn't exist
-      {
-         // ERROR:  SHOULD NEVER BE REACHED
-#if defined(PNG_DEBUG)
-         png_debug(1, "Internal libpng logic error (GCC "
-           "png_read_filter_row_mmx_sub())\n");
-#endif
-      }
-      break;
-
-   } // end switch (bpp)
-
-   __asm__ __volatile__ (
-//pre "movl MMXLength, %%eax         \n\t"
-//pre "mov  row, %1                  \n\t" // edi/rdi
-//pre "cmpl FullLength, %%eax        \n\t"
-      "cmpl %%edx, %%eax             \n\t"
-      "jnb sub_end                   \n\t"
-
-      "mov  %1, " PSI "              \n\t" // lp = row
-//pre "movl bpp, %%ecx               \n\t"
-      "add  " PCX ", %1              \n\t" // rp = row + bpp
-      "xorl %%ecx, %%ecx             \n\t"
-
-   "sub_lp2:                         \n\t"
-      "movb (" PSI "," PAX ",), %%cl \n\t"
-      "addb %%cl, (%1," PAX ",)      \n\t"
-      "incl %%eax                    \n\t"
-      "cmpl %%edx, %%eax             \n\t" // FullLength
-      "jb sub_lp2                    \n\t"
-
-   "sub_end:                         \n\t"
-      "EMMS                          \n\t" // end MMX instructions
-
-      : "=c" (dummy_value_c),   // 0      // output regs (dummy)
-        "=D" (dummy_value_D),   // 1
-        "=a" (dummy_value_a),   // 2
-        "=d" (dummy_value_d)    // 3
-
-      : "0" (bpp),              // ecx    // input regs
-        "1" (row),              // edi
-        "2" (MMXLength),        // eax
-        "3" (FullLength)        // edx
-
-      : "%esi"                            // clobber list
-   );
-
-} // end of png_read_filter_row_mmx_sub()
-
-#endif /* PNG_MMX_READ_FILTER_SUB_SUPPORTED */
-
-
-
-
-#if defined(PNG_MMX_READ_FILTER_UP_SUPPORTED)
-
-//===========================================================================//
-//                                                                           //
-//            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
-//                                                                           //
-//===========================================================================//
-
-// Optimized code for PNG Up filter decoder
-
-static void /* PRIVATE */
-png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
-                           png_bytep prev_row)
-{
-   unsigned len;        // png_uint_32 is actually 64-bit on x86-64
-   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
-   png_bytep dummy_value_S;
-   png_bytep dummy_value_D;
-
-   len = row_info->rowbytes;              // number of bytes to filter
-
-   __asm__ __volatile__ (
-      SAVE_GOT_ebx
-//pre "mov  prev_row, %1           \n\t" // esi/rsi
-//pre "movl row, %2                \n\t" // edi/rdi
-
-      "xorl %%ebx, %%ebx           \n\t"
-      "xorl %%eax, %%eax           \n\t"
-
-      // get # of bytes to alignment (note:  computing _delta_ of two pointers,
-      // so hereafter %%ecx is sufficient even on 64-bit)
-      "mov  %2, " PCX "            \n\t" // take start of row
-      "add  $0x7, " PCX "          \n\t" // add 7 to incr past alignment bdry
-//    "andl $0xfffffff8, %%ecx     \n\t" // mask to alignment boundary (32-bit!)
-      CLEAR_BOTTOM_3_BITS  PCX    "\n\t" // mask to alignment boundary
-      "sub  %2, " PCX "            \n\t" // subtract row ptr again => ebp =
-      "jz up_go                    \n\t" //  target value of ecx at alignment
-
-   "up_lp1:                        \n\t" // fix alignment
-      "movb (%2," PBX ",), %%al    \n\t"
-      "addb (%1," PBX ",), %%al    \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
-      "jb up_lp1                   \n\t" //  offset incl ebx
-
-   "up_go:                         \n\t"
-//pre "movl len, %%edx             \n\t"
-      "movl %%edx, %%ecx           \n\t"
-      "subl %%ebx, %%edx           \n\t" // subtract alignment fix
-      "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
-      "subl %%edx, %%ecx           \n\t" // sub over-bytes from original length
-
-      // unrolled loop - use all MMX registers and interleave to reduce
-      // number of branch instructions (loops) and reduce partial stalls
-   "up_loop:                       \n\t"
-      "movq (%1," PBX ",), %%mm1   \n\t"
-      "movq (%2," PBX ",), %%mm0   \n\t"
-      "movq 8(%1," PBX ",), %%mm3  \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "movq 8(%2," PBX ",), %%mm2  \n\t"
-      "movq %%mm0, (%2," PBX ",)   \n\t"
-      "paddb %%mm3, %%mm2          \n\t"
-      "movq 16(%1," PBX ",), %%mm5 \n\t"
-      "movq %%mm2, 8(%2," PBX ",)  \n\t"
-      "movq 16(%2," PBX ",), %%mm4 \n\t"
-      "movq 24(%1," PBX ",), %%mm7 \n\t"
-      "paddb %%mm5, %%mm4          \n\t"
-      "movq 24(%2," PBX ",), %%mm6 \n\t"
-      "movq %%mm4, 16(%2," PBX ",) \n\t"
-      "paddb %%mm7, %%mm6          \n\t"
-      "movq 32(%1," PBX ",), %%mm1 \n\t"
-      "movq %%mm6, 24(%2," PBX ",) \n\t"
-      "movq 32(%2," PBX ",), %%mm0 \n\t"
-      "movq 40(%1," PBX ",), %%mm3 \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "movq 40(%2," PBX ",), %%mm2 \n\t"
-      "movq %%mm0, 32(%2," PBX ",) \n\t"
-      "paddb %%mm3, %%mm2          \n\t"
-      "movq 48(%1," PBX ",), %%mm5 \n\t"
-      "movq %%mm2, 40(%2," PBX ",) \n\t"
-      "movq 48(%2," PBX ",), %%mm4 \n\t"
-      "movq 56(%1," PBX ",), %%mm7 \n\t"
-      "paddb %%mm5, %%mm4          \n\t"
-      "movq 56(%2," PBX ",), %%mm6 \n\t"
-      "movq %%mm4, 48(%2," PBX ",) \n\t"
-      "addl $64, %%ebx             \n\t"
-      "paddb %%mm7, %%mm6          \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movq %%mm6, -8(%2," PBX ",) \n\t" // (+56)movq does not affect flags;
-      "jb up_loop                  \n\t" //  -8 to offset addl ebx
-
-      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
-      "jz up_end                   \n\t"
-
-      "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
-      "jb up_lt8                   \n\t" //  [added by lcreeve at netins.net]
-
-      "addl %%edx, %%ecx           \n\t"
-      "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
-      "subl %%edx, %%ecx           \n\t" // drop over-bytes from length
-      "jz up_lt8                   \n\t"
-
-   "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
-      "movq (%1," PBX ",), %%mm1   \n\t"
-      "movq (%2," PBX ",), %%mm0   \n\t"
-      "addl $8, %%ebx              \n\t"
-      "paddb %%mm1, %%mm0          \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movq %%mm0, -8(%2," PBX ",) \n\t" // movq does not affect flags; -8 to
-      "jb up_lpA                   \n\t" //  offset add ebx
-      "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
-      "jz up_end                   \n\t"
-
-   "up_lt8:                        \n\t"
-      "xorl %%eax, %%eax           \n\t"
-      "addl %%edx, %%ecx           \n\t" // move over byte count into counter
-
-   "up_lp2:                        \n\t" // use x86 regs for remaining bytes
-      "movb (%2," PBX ",), %%al    \n\t"
-      "addb (%1," PBX ",), %%al    \n\t"
-      "incl %%ebx                  \n\t"
-      "cmpl %%ecx, %%ebx           \n\t"
-      "movb %%al, -1(%2," PBX ",)  \n\t" // mov does not affect flags; -1 to
-      "jb up_lp2                   \n\t" //  offset inc ebx
-
-   "up_end:                        \n\t"
-      "EMMS                        \n\t" // conversion of filtered row complete
-      RESTORE_GOT_ebx
-
-      : "=d" (dummy_value_d),   // 0     // output regs (dummy)
-        "=S" (dummy_value_S),   // 1
-        "=D" (dummy_value_D)    // 2
-
-      : "0" (len),              // edx   // input regs
-        "1" (prev_row),         // esi
-        "2" (row)               // edi
-
-      : "%eax", "%ecx"                   // clobber list (no input regs!)
-        _CLOBBER_GOT_ebx
-#if defined(PNG_CLOBBER_MMX_REGS_SUPPORTED)
-      , "%mm0", "%mm1", "%mm2", "%mm3"
-      , "%mm4", "%mm5", "%mm6", "%mm7"
-#endif
-   );
-
-} // end of png_read_filter_row_mmx_up()
-
-#endif /* PNG_MMX_READ_FILTER_UP_SUPPORTED */
-
-
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
-/*                                                                           */
-/*===========================================================================*/
-
-/* Optimized png_read_filter_row routines */
-
-void /* PRIVATE */
-png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
-   row, png_bytep prev_row, int filter)
-{
-#if defined(PNG_DEBUG)
-   char filtname[10];
-#endif
-
-   if (_mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-#if defined(PNG_DEBUG)
-   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
-   switch (filter)
-   {
-      case 0:
-         png_snprintf(filtname, 10, "none");
-         break;
-
-      case 1:
-         png_snprintf(filtname, 10, "sub-%s",
-#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif
-           "C");
-         break;
-
-      case 2:
-         png_snprintf(filtname, 10, "up-%s",
-#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif
-           "C");
-         break;
-
-      case 3:
-         png_snprintf(filtname, 10, "avg-%s",
-#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif
-           "C");
-         break;
-
-      case 4:
-         png_snprintf(filtname, 10, "paeth-%s",
-#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
-#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
-#if !defined(PNG_1_0_X)
-           ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
-            (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-            (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-           _mmx_supported
-#endif
-           ? "MMX" :
-#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
-#endif
-           "C");
-         break;
-
-      default:
-         png_snprintf(filtname, 10, "unknown");
-         break;
-   }
-   png_debug2(2, "row_number=%ld, %s, ", png_ptr->row_number, filtname);
-   //png_debug1(0, "png_ptr=%10p, ", png_ptr);
-   //png_debug1(0, "asm_flags=0x%08lx, ", png_ptr->asm_flags);
-   png_debug1(0, "row=%10p, ", row);
-   png_debug2(0, "pixdepth=%d, bytes=%d, ", (int)row_info->pixel_depth,
-      (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0, "rowbytes=%ld\n", row_info->rowbytes);
-#endif /* PNG_DEBUG */
-
-   switch (filter)
-   {
-      case PNG_FILTER_VALUE_NONE:
-         break;
-
-      case PNG_FILTER_VALUE_SUB:
-#ifdef PNG_MMX_READ_FILTER_SUB_SUPPORTED
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_sub(row_info, row);
-         }
-         else
-#endif
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_bytep rp = row + bpp;
-            png_bytep lp = row;
-
-            for (i = bpp; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_sub */
-         break;
-
-      case PNG_FILTER_VALUE_UP:
-#ifdef PNG_MMX_READ_FILTER_UP_SUPPORTED
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_up(row_info, row, prev_row);
-         }
-          else
-#endif
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-
-            for (i = 0; i < istop; ++i)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_up */
-         break;
-
-      case PNG_FILTER_VALUE_AVG:
-#ifdef PNG_MMX_READ_FILTER_AVG_SUPPORTED
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_avg(row_info, row, prev_row);
-         }
-         else
-#endif
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++) >> 1)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_avg */
-         break;
-
-      case PNG_FILTER_VALUE_PAETH:
-#ifdef PNG_MMX_READ_FILTER_PAETH_SUPPORTED
-#if defined(PNG_x86_64_USE_GOTPCREL) || defined(PNG_THREAD_UNSAFE_OK)
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (_mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
-         }
-         else
-#endif /* PNG_x86_64_USE_GOTPCREL || PNG_THREAD_UNSAFE_OK */
-#endif
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_bytep cp = prev_row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
-            {
-               int a, b, c, pa, pb, pc, p;
-
-               a = *lp++;
-               b = *pp++;
-               c = *cp++;
-
-               p = b - c;
-               pc = a - c;
-
-#if defined(PNG_USE_ABS)
-               pa = abs(p);
-               pb = abs(pc);
-               pc = abs(p + pc);
-#else
-               pa = p < 0 ? -p : p;
-               pb = pc < 0 ? -pc : pc;
-               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
-#endif
-
-               /*
-                  if (pa <= pb && pa <= pc)
-                     p = a;
-                  else if (pb <= pc)
-                     p = b;
-                  else
-                     p = c;
-                */
-
-               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
-
-               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-               rp++;
-            }
-         }  /* end !UseMMX_paeth */
-         break;
-
-      default:
-         png_warning(png_ptr, "Ignoring bad row-filter type");
-         *row=0;
-         break;
-   }
-}
-
-#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
-
-
-#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */
-#endif /* __GNUC__ */
+/* pnggccrd.c was removed from libpng-1.2.20. */
diff --git a/pngget.c b/pngget.c
index 75d2ca0..9cf1b84 100644
--- a/pngget.c
+++ b/pngget.c
@@ -841,6 +841,7 @@
 png_uint_32 PNGAPI
 png_get_asm_flags (png_structp png_ptr)
 {
+    /* obsolete, to be removed from libpng-1.4.0 */
 #ifdef PNG_MMX_CODE_SUPPORTED
     return (png_uint_32)(png_ptr? png_ptr->asm_flags : 0L);
 #else
@@ -852,94 +853,36 @@
 png_uint_32 PNGAPI
 png_get_asm_flagmask (int flag_select)
 {
-#ifdef PNG_MMX_CODE_SUPPORTED
-    png_uint_32 settable_asm_flags = 0;
-
-    if (flag_select & PNG_SELECT_READ)
-        settable_asm_flags |=
-          PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
-          PNG_ASM_FLAG_MMX_READ_INTERLACE    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-          /* no non-MMX flags yet */
-
-#if 0
-    /* GRR:  no write-flags yet, either, but someday... */
-    if (flag_select & PNG_SELECT_WRITE)
-        settable_asm_flags |=
-          PNG_ASM_FLAG_MMX_WRITE_ [whatever] ;
-#endif /* 0 */
-
-    return settable_asm_flags;  /* _theoretically_ settable capabilities only */
-#else
-    return (0L);
-#endif /* PNG_MMX_CODE_SUPPORTED */
+    /* obsolete, to be removed from libpng-1.4.0 */
+    flag_select=flag_select;
+    return 0L;
 }
 
-
     /* GRR:  could add this:   && defined(PNG_MMX_CODE_SUPPORTED) */
 /* this function was added to libpng 1.2.0 */
 png_uint_32 PNGAPI
 png_get_mmx_flagmask (int flag_select, int *compilerID)
 {
-#if defined(PNG_MMX_CODE_SUPPORTED)
-    png_uint_32 settable_mmx_flags = 0;
-
-    if (flag_select & PNG_SELECT_READ)
-        settable_mmx_flags |=
-          PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
-          PNG_ASM_FLAG_MMX_READ_INTERLACE    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
-          PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
-          PNG_ASM_FLAG_MMX_READ_FILTER_PAETH ;
-#if 0
-    /* GRR:  no MMX write support yet, but someday... */
-    if (flag_select & PNG_SELECT_WRITE)
-        settable_mmx_flags |=
-          PNG_ASM_FLAG_MMX_WRITE_ [whatever] ;
-#endif /* 0 */
-
-    if (compilerID != NULL) {
-#ifdef PNG_USE_PNGVCRD
-        *compilerID = 1;    /* MSVC */
-#else
-#ifdef PNG_USE_PNGGCCRD
-        *compilerID = 2;    /* gcc/gas */
-#else
-        *compilerID = -1;   /* unknown (i.e., no asm/MMX code compiled) */
-#endif
-#endif
-    }
-
-    return settable_mmx_flags;  /* _theoretically_ settable capabilities only */
-#else
-    return (0L);
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
+    /* obsolete, to be removed from libpng-1.4.0 */
+    flag_select=flag_select;
+    *compilerID = -1;   /* unknown (i.e., no asm/MMX code compiled) */
+    return 0L;
 }
 
 /* this function was added to libpng 1.2.0 */
 png_byte PNGAPI
 png_get_mmx_bitdepth_threshold (png_structp png_ptr)
 {
-#if defined(PNG_MMX_CODE_SUPPORTED)
-    return (png_byte)(png_ptr? png_ptr->mmx_bitdepth_threshold : 0);
-#else
+    /* obsolete, to be removed from libpng-1.4.0 */
     return (png_ptr? 0: 0);
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 
 /* this function was added to libpng 1.2.0 */
 png_uint_32 PNGAPI
 png_get_mmx_rowbytes_threshold (png_structp png_ptr)
 {
-#if defined(PNG_MMX_CODE_SUPPORTED)
-    return (png_uint_32)(png_ptr? png_ptr->mmx_rowbytes_threshold : 0L);
-#else
+    /* obsolete, to be removed from libpng-1.4.0 */
     return (png_ptr? 0L: 0L);
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 #endif /* ?PNG_1_0_X */
 #endif /* ?PNG_ASSEMBLER_CODE_SUPPORTED */
diff --git a/pngpread.c b/pngpread.c
index 3d910c3..d5952bb 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -1,7 +1,7 @@
 
 /* pngpread.c - read a png file in push mode
  *
- * Last changed in libpng 1.2.20 August 30, 2007
+ * Last changed in libpng 1.2.20 September 1, 2007
  * For conditions of distribution and use, see copyright notice in png.h
  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  * (Version 0.96 Copyright (c) 1996, 1997 Andreas Dilger)
@@ -1001,11 +1001,6 @@
    /* offset to next interlace block in the y direction */
    PNG_CONST int FARDATA png_pass_yinc[] = {8, 8, 8, 4, 4, 2, 2};
 
-   /* Width of interlace block.  This is not currently used - if you need
-    * it, uncomment it here and in png.h
-   PNG_CONST int FARDATA png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
-   */
-
    /* Height of interlace block.  This is not currently used - if you need
     * it, uncomment it here and in png.h
    PNG_CONST int FARDATA png_pass_height[] = {8, 8, 4, 4, 2, 2, 1};
diff --git a/pngread.c b/pngread.c
index c26ce61..c914030 100644
--- a/pngread.c
+++ b/pngread.c
@@ -55,12 +55,6 @@
    if (png_ptr == NULL)
       return (NULL);
 
-#if !defined(PNG_1_0_X)
-#ifdef PNG_MMX_CODE_SUPPORTED
-   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
-#endif
-#endif /* PNG_1_0_X */
-
    /* added at libpng-1.2.6 */
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
    png_ptr->user_width_max=PNG_USER_WIDTH_MAX;
diff --git a/pngrutil.c b/pngrutil.c
index 4da862e..99bca19 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -2277,1051 +2277,10 @@
    to any alpha or transparency value associated with the pixel.  If
    you want all pixels to be combined, pass 0xff (255) in mask.  */
 
-/* Optimized C version of utilities to read a PNG file
- *
- * Based on code contributed by Nirav Chhatrapati, Intel Corp., 1998.
- * Interface to libpng contributed by Gilles Vollant, 1999.
- * GNU C port by Greg Roelofs, 1999-2001.
- *
- */
-
-#if defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-#if !defined(PNG_HAVE_MMX_COMBINE_ROW)
-
-/*===========================================================================*/
-/*                                                                           */
-/*                       P N G _ C O M B I N E _ R O W                       */
-/*                                                                           */
-/*===========================================================================*/
-
-
-#define BPP2  2
-#define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
-#define BPP4  4
-#define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
-#define BPP8  8
-
-/* Combines the row recently read in with the previous row.
-   This routine takes care of alpha and transparency if requested.
-   This routine also handles the two methods of progressive display
-   of interlaced images, depending on the mask value.
-   The mask value describes which pixels are to be combined with
-   the row.  The pattern always repeats every 8 pixels, so just 8
-   bits are needed.  A one indicates the pixel is to be combined; a
-   zero indicates the pixel is to be skipped.  This is in addition
-   to any alpha or transparency value associated with the pixel.
-   If you want all pixels to be combined, pass 0xff (255) in mask. */
-
-/* Use this routine for the x86 platform - it uses a faster MMX routine
-   if the machine supports MMX. */
-
 void /* PRIVATE */
 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
 {
-
-#if defined(PNG_USE_LOCAL_ARRAYS)
-static PNG_CONST int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
-static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-static PNG_CONST int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
-#endif
-
-   png_debug(1, "in png_combine_row (pngrutil.c OPTIMIZED)\n");
-
-   if (mask == 0xff)
-   {
-      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
-      png_memcpy(row, png_ptr->row_buf + 1,
-       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
-   }
-   else   /* (png_combine_row() is never called with mask == 0) */
-   {
-      switch (png_ptr->row_info.pixel_depth)
-      {
-         /* most common case:  combining 24-bit RGB */
-         case 24:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP3;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 24 bpp */
-
-         case 32:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP4;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            }
-
-            break;
-         }       /* end 32 bpp */
-
-         case 8:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = len;  /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff /* *BPP1 */ ;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            }
-
-            break;
-         }       /* end 8 bpp */
-
-         case 1:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_inc, s_start, s_end;
-            int m;
-            int shift;
-            png_uint_32 i;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-                s_start = 0;
-                s_end = 7;
-                s_inc = 1;
-            }
-            else
-#endif
-            {
-                s_start = 7;
-                s_end = 0;
-                s_inc = -1;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  int value;
-
-                  value = (*sp >> shift) & 0x1;
-                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 1 bpp */
-
-         case 2:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-            else
-#endif
-            {
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0x3;
-                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 2 bpp */
-
-         case 4:        /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-            else
-#endif
-            {
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0xf;
-                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }       /* end 4 bpp */
-
-         case 16:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP2;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            } /* end of else (_mmx_supported) */
-
-            break;
-         }       /* end 16 bpp */
-
-
-
-         case 48:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            {
-               register png_uint_32 i;
-               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
-                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
-                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
-                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-               png_uint_32 len = png_ptr->width &~7; /* reduce to mult of 8 */
-               int diff = (int) (png_ptr->width & 7); /* amount lost */
-               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
-
-               srcptr = png_ptr->row_buf + 1 + initial_val;
-               dstptr = row + initial_val;
-
-               for (i = initial_val; i < final_val; i += stride)
-               {
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-               if (diff)  /* number of leftover pixels:  3 for pngtest */
-               {
-                  final_val += diff*BPP6;
-                  for (; i < final_val; i += stride)
-                  {
-                     if (rep_bytes > (int)(final_val-i))
-                        rep_bytes = (int)(final_val-i);
-                     png_memcpy(dstptr, srcptr, rep_bytes);
-                     srcptr += stride;
-                     dstptr += stride;
-                  }
-               }
-            }
-            break;
-         }       /* end 48 bpp */
-
-         case 64:       /* png_ptr->row_info.pixel_depth */
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            register png_uint_32 i;
-            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
-              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
-            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
-              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
-            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
-              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
-            png_uint_32 len = png_ptr->width &~7;  /* reduce to mult of 8 */
-            int diff = (int) (png_ptr->width & 7); /* amount lost */
-            register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
-
-            srcptr = png_ptr->row_buf + 1 + initial_val;
-            dstptr = row + initial_val;
-
-            for (i = initial_val; i < final_val; i += stride)
-            {
-               png_memcpy(dstptr, srcptr, rep_bytes);
-               srcptr += stride;
-               dstptr += stride;
-            }
-            if (diff)  /* number of leftover pixels:  3 for pngtest */
-            {
-               final_val += diff*BPP8;
-               for (; i < final_val; i += stride)
-               {
-                  if (rep_bytes > (int)(final_val-i))
-                     rep_bytes = (int)(final_val-i);
-                  png_memcpy(dstptr, srcptr, rep_bytes);
-                  srcptr += stride;
-                  dstptr += stride;
-               }
-            }
-
-            break;
-         }       /* end 64 bpp */
-
-         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
-         {
-            /* this should never happen */
-            png_warning(png_ptr, "Invalid row_info.pixel_depth in pngrutil");
-            break;
-         }
-      } /* end switch (png_ptr->row_info.pixel_depth) */
-
-   } /* end if (non-trivial mask) */
-
-} /* end png_combine_row() */
-#endif /* PNG_HAVE_MMX_COMBINE_ROW */
-
-
-
-/*===========================================================================*/
-/*                                                                           */
-/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
-/*                                                                           */
-/*===========================================================================*/
-
-#if defined(PNG_READ_INTERLACING_SUPPORTED)
-#if !defined(PNG_HAVE_MMX_READ_INTERLACE)
-
-/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
- * has taken place.  [GRR: what other steps come before and/or after?]
- */
-
-void /* PRIVATE */
-png_do_read_interlace(png_structp png_ptr)
-{
-#if defined(PNG_USE_LOCAL_ARRAYS)
-static PNG_CONST int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
-#endif
-   png_row_infop row_info = &(png_ptr->row_info);
-   png_bytep row = png_ptr->row_buf + 1;
-   int pass = png_ptr->pass;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-   png_uint_32 transformations = png_ptr->transformations;
-#endif
-   png_debug(1,"in png_do_read_interlace (pngrutil.c OPTIMIZED)\n");
-
-   if (row != NULL && row_info != NULL)
-   {
-      png_uint_32 final_width;
-
-      final_width = row_info->width * png_pass_inc[pass];
-
-      switch (row_info->pixel_depth)
-      {
-         case 1:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_byte v;
-            png_uint_32 i;
-            int j;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 3);
-            dp = row + (png_size_t)((final_width - 1) >> 3);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (int)((row_info->width + 7) & 7);
-               dshift = (int)((final_width + 7) & 7);
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-            else
-#endif
-            {
-               sshift = 7 - (int)((row_info->width + 7) & 7);
-               dshift = 7 - (int)((final_width + 7) & 7);
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               v = (png_byte)((*sp >> sshift) & 0x1);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 2);
-            dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0x3);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 1);
-            dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0xf);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-       /*====================================================================*/
-
-         default: /* 8-bit or larger (this is where the routine is modified) */
-         {
-            png_bytep sptr, dp;
-            png_uint_32 i;
-            png_size_t pixel_bytes;
-            int width = (int)row_info->width;
-
-            pixel_bytes = (row_info->pixel_depth >> 3);
-
-            /* point sptr at the last pixel in the pre-expanded row: */
-            sptr = row + (width - 1) * pixel_bytes;
-
-            /* point dp at the last pixel position in the expanded row: */
-            dp = row + (final_width - 1) * pixel_bytes;
-
-            /* MMX not supported:  use modified C code - takes advantage
-             *   of inlining of png_memcpy for a constant */
-            /* GRR 19991007:  does it?  or should pixel_bytes in each
-             *   block be replaced with immediate value (e.g., 1)? */
-            /* GRR 19991017:  replaced with constants in each case */
-            {
-               if (pixel_bytes == 1)
-               {
-                  for (i = width; i; i--)
-                  {
-                     int j;
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        *dp-- = *sptr;
-                     }
-                     --sptr;
-                  }
-               }
-               else if (pixel_bytes == 3)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 3);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 3);
-                        dp -= 3;
-                     }
-                     sptr -= 3;
-                  }
-               }
-               else if (pixel_bytes == 2)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 2);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 2);
-                        dp -= 2;
-                     }
-                     sptr -= 2;
-                  }
-               }
-               else if (pixel_bytes == 4)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 4);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-#if defined(PNG_DEBUG) && defined(PNG_1_0_X)
-                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
-                        {
-                           printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
-                             row, dp, row+png_ptr->row_buf_size);
-                           printf("row_buf=%d\n",png_ptr->row_buf_size);
-                        }
-#endif
-                        png_memcpy(dp, v, 4);
-                        dp -= 4;
-                     }
-                     sptr -= 4;
-                  }
-               }
-               else if (pixel_bytes == 6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 6);
-                        dp -= 6;
-                     }
-                     sptr -= 6;
-                  }
-               }
-               else if (pixel_bytes == 8)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 8);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 8);
-                        dp -= 8;
-                     }
-                     sptr -= 8;
-                  }
-               }
-               else     /* GRR:  should never be reached */
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-
-            }
-            break;
-         }
-      } /* end switch (row_info->pixel_depth) */
-
-      row_info->width = final_width;
-
-      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
-   }
-
-} /* end png_do_read_interlace() */
-
-#endif /* PNG_HAVE_MMX_READ_INTERLACE */
-#endif /* PNG_READ_INTERLACING_SUPPORTED */
-
-
-
-#if !defined(PNG_HAVE_MMX_READ_FILTER_ROW)
-/*===========================================================================*/
-/*                                                                           */
-/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
-/*                                                                           */
-/*===========================================================================*/
-
-
-/* Optimized png_read_filter_row routines */
-
-void /* PRIVATE */
-png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
-   row, png_bytep prev_row, int filter)
-{
-#if defined(PNG_DEBUG)
-   char filnm[10];
-#endif
-
-
-#if defined(PNG_DEBUG)
-   png_debug(1, "in png_read_filter_row (pngrutil.c OPTIMIZED)\n");
-   switch (filter)
-   {
-      case 0:
-         png_snprintf(filnm, 10, "none");
-         break;
-
-      case 1:
-         png_snprintf(filnm, 10, "sub-%s",
-             "x86");
-         break;
-
-      case 2:
-         png_snprintf(filnm, 10, "up-%s",
-             "x86");
-         break;
-
-      case 3:
-         png_snprintf(filnm, 10, "avg-%s",
-             "x86");
-         break;
-
-      case 4:
-         png_snprintf(filnm, 10, "Paeth-%s",
-             "x86");
-         break;
-
-      default:
-         png_snprintf(filnm, 10, "unknown");
-         break;
-   }
-   png_debug2(0, "row_number=%5ld, %10s, ", png_ptr->row_number, filnm);
-   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
-   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
-      (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
-#endif /* PNG_DEBUG */
-
-   switch (filter)
-   {
-      case PNG_FILTER_VALUE_NONE:
-         break;
-
-      case PNG_FILTER_VALUE_SUB:
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_bytep rp = row + bpp;
-            png_bytep lp = row;
-
-            for (i = bpp; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      case PNG_FILTER_VALUE_UP:
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-
-            for (i = 0; i < istop; ++i)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      case PNG_FILTER_VALUE_AVG:
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++) >> 1)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      case PNG_FILTER_VALUE_PAETH:
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_bytep cp = prev_row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
-            {
-               int a, b, c, pa, pb, pc, p;
-
-               a = *lp++;
-               b = *pp++;
-               c = *cp++;
-
-               p = b - c;
-               pc = a - c;
-
-#if defined(PNG_USE_ABS)
-               pa = abs(p);
-               pb = abs(pc);
-               pc = abs(p + pc);
-#else
-               pa = p < 0 ? -p : p;
-               pb = pc < 0 ? -pc : pc;
-               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
-#endif
-
-               /*
-                  if (pa <= pb && pa <= pc)
-                     p = a;
-                  else if (pb <= pc)
-                     p = b;
-                  else
-                     p = c;
-                */
-
-               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
-
-               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-               rp++;
-            }
-         }
-         break;
-
-      default:
-         png_warning(png_ptr, "Ignoring bad row-filter type");
-         *row=0;
-         break;
-   }
-}
-
-#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
-#endif /* PNG_OPTIMIZED_CODE_SUPPORTED */
-
-#if !defined(PNG_ASSEMBLER_CODE_SUPPORTED) || \
-    !defined(PNG_MMX_CODE_SUPPORTED) || \
-    (!defined(PNG_USE_PNGGCCRD) && !defined(PNG_USE_PNGVCRD))
-#if !defined(PNG_OPTIMIZED_CODE_SUPPORTED)
-/* Use the unoptimized original C code.  This might be removed from a future
- * version of libpng if testing proves it to be worthless. */
-void /* PRIVATE */
-png_combine_row(png_structp png_ptr, png_bytep row, int mask)
-{
-   png_debug(1,"in png_combine_row NOT OPTIMIZED\n");
+   png_debug(1,"in png_combine_row\n");
    if (mask == 0xff)
    {
       png_memcpy(row, png_ptr->row_buf + 1,
@@ -3537,7 +2496,7 @@
    PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
 #endif
 
-   png_debug(1,"in png_do_read_interlace (pngrutil.c NOT OPTIMIZED)\n");
+   png_debug(1,"in png_do_read_interlace\n");
    if (row != NULL && row_info != NULL)
    {
       png_uint_32 final_width;
@@ -3750,7 +2709,7 @@
 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
    png_bytep prev_row, int filter)
 {
-   png_debug(1, "in png_read_filter_row (NOT OPTIMIZED)\n");
+   png_debug(1, "in png_read_filter_row\n");
    png_debug2(2,"row = %lu, filter = %d\n", png_ptr->row_number, filter);
    switch (filter)
    {
@@ -3868,9 +2827,6 @@
          break;
    }
 }
-#endif /* !PNG_OPTIMIZED_CODE_SUPPORTED */
-#endif /* !PNG_ASSEMBLER_CODE_SUPPORTED || !PNG_MMX_CODE_SUPPORTED || */
-       /* (!PNG_USE_PNGGCCRD && !PNG_USE_PNGVCRD) */
 
 void /* PRIVATE */
 png_read_finish_row(png_structp png_ptr)
@@ -4165,9 +3121,6 @@
 #endif
    png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes+64);
    png_ptr->row_buf = png_ptr->big_row_buf+32;
-#if defined(PNG_DEBUG) && defined(PNG_USE_PNGGCCRD) && defined(PNG_1_0_X)
-   png_ptr->row_buf_size = row_bytes;
-#endif
 
 #ifdef PNG_MAX_MALLOC_64K
    if ((png_uint_32)png_ptr->rowbytes + 1 > (png_uint_32)65536L)
diff --git a/pngset.c b/pngset.c
index 8d1cbfc..2871140 100644
--- a/pngset.c
+++ b/pngset.c
@@ -1202,51 +1202,13 @@
 
 #ifndef PNG_1_0_X
 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
-/* this function was added to libpng 1.2.0 and should always exist by default */
+/* function was added to libpng 1.2.0 and should always exist by default */
 void PNGAPI
 png_set_asm_flags (png_structp png_ptr, png_uint_32 asm_flags)
 {
-#ifdef PNG_MMX_CODE_SUPPORTED
-    png_uint_32 settable_asm_flags;
-    png_uint_32 settable_mmx_flags;
-#endif
-    if (png_ptr == NULL)
-       return;
-#ifdef PNG_MMX_CODE_SUPPORTED
-
-    settable_mmx_flags =
-#ifdef PNG_HAVE_MMX_COMBINE_ROW
-                         PNG_ASM_FLAG_MMX_READ_COMBINE_ROW  |
-#endif
-#ifdef PNG_HAVE_MMX_READ_INTERLACE
-                         PNG_ASM_FLAG_MMX_READ_INTERLACE    |
-#endif
-#ifdef PNG_HAVE_MMX_READ_FILTER_ROW
-                         PNG_ASM_FLAG_MMX_READ_FILTER_SUB   |
-                         PNG_ASM_FLAG_MMX_READ_FILTER_UP    |
-                         PNG_ASM_FLAG_MMX_READ_FILTER_AVG   |
-                         PNG_ASM_FLAG_MMX_READ_FILTER_PAETH |
-#endif
-                         0;
-
-    /* could be some non-MMX ones in the future, but not currently: */
-    settable_asm_flags = settable_mmx_flags;
-
-    if (!(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_SUPPORT_COMPILED) ||
-        !(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_SUPPORT_IN_CPU))
-    {
-        /* clear all MMX flags if MMX isn't supported */
-        settable_asm_flags &= ~settable_mmx_flags;
-        png_ptr->asm_flags &= ~settable_mmx_flags;
-    }
-
-    /* we're replacing the settable bits with those passed in by the user,
-     * so first zero them out of the master copy, then bitwise-OR in the
-     * allowed subset that was requested */
-
-    png_ptr->asm_flags &= ~settable_asm_flags;               /* zero them */
-    png_ptr->asm_flags |= (asm_flags & settable_asm_flags);  /* set them */
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
+/* Obsolete as of libpng-1.2.20 and will be removed from libpng-1.4.0 */
+    if (png_ptr != NULL)
+    png_ptr->asm_flags = 0;
 }
 
 /* this function was added to libpng 1.2.0 */
@@ -1255,12 +1217,9 @@
                         png_byte mmx_bitdepth_threshold,
                         png_uint_32 mmx_rowbytes_threshold)
 {
+/* Obsolete as of libpng-1.2.20 and will be removed from libpng-1.4.0 */
     if (png_ptr == NULL)
        return;
-#ifdef PNG_MMX_CODE_SUPPORTED
-    png_ptr->mmx_bitdepth_threshold = mmx_bitdepth_threshold;
-    png_ptr->mmx_rowbytes_threshold = mmx_rowbytes_threshold;
-#endif /* ?PNG_MMX_CODE_SUPPORTED */
 }
 #endif /* ?PNG_ASSEMBLER_CODE_SUPPORTED */
 
diff --git a/pngtest.c b/pngtest.c
index c46d9177..70734a3 100644
--- a/pngtest.c
+++ b/pngtest.c
@@ -1548,4 +1548,4 @@
 }
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef version_1_2_20rc3 your_png_h_is_not_version_1_2_20rc3;
+typedef version_1_2_20rc4 your_png_h_is_not_version_1_2_20rc4;
diff --git a/pngvcrd.c b/pngvcrd.c
index edd8ff6..ce4233e 100644
--- a/pngvcrd.c
+++ b/pngvcrd.c
@@ -1,3917 +1 @@
-
-/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
- *
- * For Intel x86 CPU and Microsoft Visual C++ compiler
- *
- * Last changed in libpng 1.2.20 August 30, 2007
- * For conditions of distribution and use, see copyright notice in png.h
- * Copyright (c) 1998-2007 Glenn Randers-Pehrson
- * Copyright (c) 1998, Intel Corporation
- *
- * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
- * Interface to libpng contributed by Gilles Vollant, 1999
- *
- *
- * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
- * a sign error in the post-MMX cleanup code for each pixel_depth resulted
- * in bad pixels at the beginning of some rows of some images, and also
- * (due to out-of-range memory reads and writes) caused heap corruption
- * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
- *
- * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
- *
- * [runtime MMX configuration, GRR 20010102]
- *
- * [Copy 6 bytes per pixel, not 4, and use stride of 6, not 4, in the
- *  second loop of interlace processing of 48-bit pixels, GR-P 20070717]
- *
- * [move instances of uAll union into local, except for two constant
- * instances, GR-P 20070805]
- *
- * [revised handling of static constants for efficiency, Steve Snyder 20070821]
- */
-
-#define PNG_INTERNAL
-#include "png.h"
-
-#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && \
-    defined(PNG_USE_PNGVCRD) && defined(PNG_MMX_CODE_SUPPORTED)
-
-#ifdef PNG_USE_LOCAL_ARRAYS
-static PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
-#endif
-
-static int mmx_supported=2;
-
-int PNGAPI
-png_mmx_support(void)
-{
-  int mmx_supported_local = 0;
-  _asm {
-    push ebx          //CPUID will trash these
-    push ecx
-    push edx
-
-    pushfd            //Save Eflag to stack
-    pop eax           //Get Eflag from stack into eax
-    mov ecx, eax      //Make another copy of Eflag in ecx
-    xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
-    push eax          //Save modified Eflag back to stack
-
-    popfd             //Restored modified value back to Eflag reg
-    pushfd            //Save Eflag to stack
-    pop eax           //Get Eflag from stack
-    push ecx          // save original Eflag to stack
-    popfd             // restore original Eflag
-    xor eax, ecx      //Compare the new Eflag with the original Eflag
-    jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
-                      //skip following instructions and jump to
-                      //NOT_SUPPORTED label
-
-    xor eax, eax      //Set eax to zero
-
-    _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
-    _asm _emit 0xa2
-
-    cmp eax, 1        //make sure eax return non-zero value
-    jl NOT_SUPPORTED  //If eax is zero, mmx not supported
-
-    xor eax, eax      //set eax to zero
-    inc eax           //Now increment eax to 1.  This instruction is
-                      //faster than the instruction "mov eax, 1"
-
-    _asm _emit 0x0f   //CPUID instruction
-    _asm _emit 0xa2
-
-    and edx, 0x00800000  //mask out all bits but mmx bit(24)
-    cmp edx, 0        // 0 = mmx not supported
-    jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
-
-    mov  mmx_supported_local, 1  //set return value to 1
-
-NOT_SUPPORTED:
-    mov  eax, mmx_supported_local  //move return value to eax
-    pop edx          //CPUID trashed these
-    pop ecx
-    pop ebx
-  }
-
-  //mmx_supported_local=0; // test code for force don't support MMX
-  //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
-
-  mmx_supported = mmx_supported_local;
-  return mmx_supported_local;
-}
-
-/* Combines the row recently read in with the previous row.
-   This routine takes care of alpha and transparency if requested.
-   This routine also handles the two methods of progressive display
-   of interlaced images, depending on the mask value.
-   The mask value describes which pixels are to be combined with
-   the row.  The pattern always repeats every 8 pixels, so just 8
-   bits are needed.  A one indicates the pixel is to be combined; a
-   zero indicates the pixel is to be skipped.  This is in addition
-   to any alpha or transparency value associated with the pixel.  If
-   you want all pixels to be combined, pass 0xff (255) in mask.  */
-
-/* Use this routine for x86 platform - uses faster MMX routine if machine
-   supports MMX */
-
-void /* PRIVATE */
-png_combine_row(png_structp png_ptr, png_bytep row, int mask)
-{
-   static PNG_CONST int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
-
-   png_debug(1,"in png_combine_row_asm\n");
-
-   if (mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (mask == 0xff)
-   {
-      png_memcpy(row, png_ptr->row_buf + 1,
-       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
-       png_ptr->width));
-   }
-   /* GRR:  add "else if (mask == 0)" case?
-    *       or does png_combine_row() not even get called in that case? */
-   else
-   {
-      switch (png_ptr->row_info.pixel_depth)
-      {
-         case 24:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-
-            static PNG_CONST __int64 bpp24_mask2=0x0101010202020404,  //24bpp
-                                     bpp24_mask1=0x0408080810101020,
-                                     bpp24_mask0=0x2020404040808080;
-
-            srcptr = png_ptr->row_buf + 1;
-            dstptr = row;
-
-            unmask = ~mask;
-            len     = (png_ptr->width)&~7;
-            diff = (png_ptr->width)&7;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp24_mask0
-                  movq       mm1,bpp24_mask1
-                  movq       mm2,bpp24_mask2
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-                  pand       mm2,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-                  pcmpeqb    mm2,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-                  cmp        ecx,0
-                  jz         mainloop24end
-
-mainloop24:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  movq       mm7,[ebx]
-                  pandn      mm6,mm7
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-
-                  movq       mm5,[esi+8]
-                  pand       mm5,mm1
-                  movq       mm7,mm1
-                  movq       mm6,[ebx+8]
-                  pandn      mm7,mm6
-                  por        mm5,mm7
-                  movq       [ebx+8],mm5
-
-                  movq       mm6,[esi+16]
-                  pand       mm6,mm2
-                  movq       mm4,mm2
-                  movq       mm7,[ebx+16]
-                  pandn      mm4,mm7
-                  por        mm6,mm4
-                  movq       [ebx+16],mm6
-
-                  add        esi,24            //inc by 24 bytes processed
-                  add        ebx,24
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop24
-
-mainloop24end:
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end24
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-secondloop24:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip24            //if CF = 0
-                  mov        ax,[esi]
-                  mov        [ebx],ax
-                  xor        eax,eax
-                  mov        al,[esi+2]
-                  mov        [ebx+2],al
-skip24:
-                  add        esi,3
-                  add        ebx,3
-
-                  dec        ecx
-                  jnz        secondloop24
-
-end24:
-                  emms
-               }
-            }
-            else /* mmx not supported - use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 24 bpp
-
-         case 32:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-
-            static PNG_CONST __int64 bpp32_mask3=0x0101010102020202,  //32bpp
-                                     bpp32_mask2=0x0404040408080808,
-                                     bpp32_mask1=0x1010101020202020,
-                                     bpp32_mask0=0x4040404080808080;
-
-            srcptr = png_ptr->row_buf + 1;
-            dstptr = row;
-
-            unmask = ~mask;
-            len     = (png_ptr->width)&~7;
-            diff = (png_ptr->width)&7;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp32_mask0
-                  movq       mm1,bpp32_mask1
-                  movq       mm2,bpp32_mask2
-                  movq       mm3,bpp32_mask3
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-                  pand       mm2,mm7
-                  pand       mm3,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-                  pcmpeqb    mm2,mm6
-                  pcmpeqb    mm3,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-
-                  cmp        ecx,0             //lcr
-                  jz         mainloop32end
-
-mainloop32:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  movq       mm7,[ebx]
-                  pandn      mm6,mm7
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-                  movq       mm5,[esi+8]
-                  pand       mm5,mm1
-                  movq       mm7,mm1
-                  movq       mm6,[ebx+8]
-                  pandn      mm7,mm6
-                  por        mm5,mm7
-                  movq       [ebx+8],mm5
-
-                  movq       mm6,[esi+16]
-                  pand       mm6,mm2
-                  movq       mm4,mm2
-                  movq       mm7,[ebx+16]
-                  pandn      mm4,mm7
-                  por        mm6,mm4
-                  movq       [ebx+16],mm6
-
-                  movq       mm7,[esi+24]
-                  pand       mm7,mm3
-                  movq       mm5,mm3
-                  movq       mm4,[ebx+24]
-                  pandn      mm5,mm4
-                  por        mm7,mm5
-                  movq       [ebx+24],mm7
-
-                  add        esi,32            //inc by 32 bytes processed
-                  add        ebx,32
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop32
-
-mainloop32end:
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end32
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-secondloop32:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip32            //if CF = 0
-                  mov        eax,[esi]
-                  mov        [ebx],eax
-skip32:
-                  add        esi,4
-                  add        ebx,4
-
-                  dec        ecx
-                  jnz        secondloop32
-
-end32:
-                  emms
-               }
-            }
-            else /* mmx _not supported - Use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 32 bpp
-
-         case 8:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int m;
-            int diff, unmask;
-
-            static PNG_CONST __int64 bpp08_mask0=0x0102040810204080;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-               m = 0x80;
-               unmask = ~mask;
-               len  = png_ptr->width &~7;  //reduce to multiple of 8
-               diff = png_ptr->width & 7;  //amount lost
-
-               _asm
-               {
-                  movd       mm7, unmask   //load bit pattern
-                  psubb      mm6,mm6       //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7       //fill register with 8 masks
-
-                  movq       mm0,bpp08_mask0
-
-                  pand       mm0,mm7       //nonzero if keep byte
-                  pcmpeqb    mm0,mm6       //zeros->1s, v versa
-
-                  mov        ecx,len       //load length of line (pixels)
-                  mov        esi,srcptr    //load source
-                  mov        ebx,dstptr    //load dest
-                  cmp        ecx,0         //lcr
-                  je         mainloop8end
-
-mainloop8:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  pandn      mm6,[ebx]
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-                  add        esi,8         //inc by 8 bytes processed
-                  add        ebx,8
-                  sub        ecx,8         //dec by 8 pixels processed
-
-                  ja         mainloop8
-mainloop8end:
-
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end8
-
-                  mov        edx,mask
-                  sal        edx,24        //make low byte the high byte
-
-secondloop8:
-                  sal        edx,1         //move high bit to CF
-                  jnc        skip8         //if CF = 0
-                  mov        al,[esi]
-                  mov        [ebx],al
-skip8:
-                  inc        esi
-                  inc        ebx
-
-                  dec        ecx
-                  jnz        secondloop8
-end8:
-                  emms
-               }
-            }
-            else /* mmx not supported - use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 8 bpp
-
-         case 1:
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_inc, s_start, s_end;
-            int m;
-            int shift;
-            png_uint_32 i;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-                s_start = 0;
-                s_end = 7;
-                s_inc = 1;
-            }
-            else
-#endif
-            {
-                s_start = 7;
-                s_end = 0;
-                s_inc = -1;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  int value;
-
-                  value = (*sp >> shift) & 0x1;
-                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-            else
-#endif
-            {
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0x3;
-                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp;
-            png_bytep dp;
-            int s_start, s_end, s_inc;
-            int m;
-            int shift;
-            png_uint_32 i;
-            int value;
-
-            sp = png_ptr->row_buf + 1;
-            dp = row;
-            m = 0x80;
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (png_ptr->transformations & PNG_PACKSWAP)
-            {
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-            else
-#endif
-            {
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            shift = s_start;
-
-            for (i = 0; i < png_ptr->width; i++)
-            {
-               if (m & mask)
-               {
-                  value = (*sp >> shift) & 0xf;
-                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
-                  *dp |= (png_byte)(value << shift);
-               }
-
-               if (shift == s_end)
-               {
-                  shift = s_start;
-                  sp++;
-                  dp++;
-               }
-               else
-                  shift += s_inc;
-               if (m == 1)
-                  m = 0x80;
-               else
-                  m >>= 1;
-            }
-            break;
-         }
-
-         case 16:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-            static PNG_CONST __int64 bpp16_mask1=0x0101020204040808,
-                                     bpp16_mask0=0x1010202040408080;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-
-               unmask = ~mask;
-               len     = (png_ptr->width)&~7;
-               diff = (png_ptr->width)&7;
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp16_mask0
-                  movq       mm1,bpp16_mask1
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-                  cmp        ecx,0             //lcr
-                  jz         mainloop16end
-
-mainloop16:
-                  movq       mm4,[esi]
-                  pand       mm4,mm0
-                  movq       mm6,mm0
-                  movq       mm7,[ebx]
-                  pandn      mm6,mm7
-                  por        mm4,mm6
-                  movq       [ebx],mm4
-
-                  movq       mm5,[esi+8]
-                  pand       mm5,mm1
-                  movq       mm7,mm1
-                  movq       mm6,[ebx+8]
-                  pandn      mm7,mm6
-                  por        mm5,mm7
-                  movq       [ebx+8],mm5
-
-                  add        esi,16            //inc by 16 bytes processed
-                  add        ebx,16
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop16
-
-mainloop16end:
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end16
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-secondloop16:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip16            //if CF = 0
-                  mov        ax,[esi]
-                  mov        [ebx],ax
-skip16:
-                  add        esi,2
-                  add        ebx,2
-
-                  dec        ecx
-                  jnz        secondloop16
-end16:
-                  emms
-               }
-            }
-            else /* mmx not supported - use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 16 bpp
-
-         case 48:
-         {
-            png_bytep srcptr;
-            png_bytep dstptr;
-            png_uint_32 len;
-            int unmask, diff;
-
-            static PNG_CONST __int64 bpp48_mask5=0x0101010101010202,
-                                     bpp48_mask4=0x0202020204040404,
-                                     bpp48_mask3=0x0404080808080808,
-                                     bpp48_mask2=0x1010101010102020,
-                                     bpp48_mask1=0x2020202040404040,
-                                     bpp48_mask0=0x4040808080808080;
-
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               srcptr = png_ptr->row_buf + 1;
-               dstptr = row;
-
-               unmask = ~mask;
-               len     = (png_ptr->width)&~7;
-               diff = (png_ptr->width)&7;
-               _asm
-               {
-                  movd       mm7, unmask       //load bit pattern
-                  psubb      mm6,mm6           //zero mm6
-                  punpcklbw  mm7,mm7
-                  punpcklwd  mm7,mm7
-                  punpckldq  mm7,mm7           //fill register with 8 masks
-
-                  movq       mm0,bpp48_mask0
-                  movq       mm1,bpp48_mask1
-                  movq       mm2,bpp48_mask2
-                  movq       mm3,bpp48_mask3
-                  movq       mm4,bpp48_mask4
-                  movq       mm5,bpp48_mask5
-
-                  pand       mm0,mm7
-                  pand       mm1,mm7
-                  pand       mm2,mm7
-                  pand       mm3,mm7
-                  pand       mm4,mm7
-                  pand       mm5,mm7
-
-                  pcmpeqb    mm0,mm6
-                  pcmpeqb    mm1,mm6
-                  pcmpeqb    mm2,mm6
-                  pcmpeqb    mm3,mm6
-                  pcmpeqb    mm4,mm6
-                  pcmpeqb    mm5,mm6
-
-                  mov        ecx,len           //load length of line
-                  mov        esi,srcptr        //load source
-                  mov        ebx,dstptr        //load dest
-
-                  cmp        ecx,0
-                  jz         mainloop48end
-
-mainloop48:
-                  movq       mm7,[esi]
-                  pand       mm7,mm0
-                  movq       mm6,mm0
-                  pandn      mm6,[ebx]
-                  por        mm7,mm6
-                  movq       [ebx],mm7
-
-                  movq       mm6,[esi+8]
-                  pand       mm6,mm1
-                  movq       mm7,mm1
-                  pandn      mm7,[ebx+8]
-                  por        mm6,mm7
-                  movq       [ebx+8],mm6
-
-                  movq       mm6,[esi+16]
-                  pand       mm6,mm2
-                  movq       mm7,mm2
-                  pandn      mm7,[ebx+16]
-                  por        mm6,mm7
-                  movq       [ebx+16],mm6
-
-                  movq       mm7,[esi+24]
-                  pand       mm7,mm3
-                  movq       mm6,mm3
-                  pandn      mm6,[ebx+24]
-                  por        mm7,mm6
-                  movq       [ebx+24],mm7
-
-                  movq       mm6,[esi+32]
-                  pand       mm6,mm4
-                  movq       mm7,mm4
-                  pandn      mm7,[ebx+32]
-                  por        mm6,mm7
-                  movq       [ebx+32],mm6
-
-                  movq       mm7,[esi+40]
-                  pand       mm7,mm5
-                  movq       mm6,mm5
-                  pandn      mm6,[ebx+40]
-                  por        mm7,mm6
-                  movq       [ebx+40],mm7
-
-                  add        esi,48            //inc by 32 bytes processed
-                  add        ebx,48
-                  sub        ecx,8             //dec by 8 pixels processed
-
-                  ja         mainloop48
-mainloop48end:
-
-                  mov        ecx,diff
-                  cmp        ecx,0
-                  jz         end48
-
-                  mov        edx,mask
-                  sal        edx,24            //make low byte the high byte
-
-secondloop48:
-                  sal        edx,1             //move high bit to CF
-                  jnc        skip48            //if CF = 0
-                  mov        eax,[esi]
-                  mov        [ebx],eax
-                  mov        ax,[esi+4]       // These 2 lines added 20070717
-                  mov        [ebx+4],ax       // Glenn R-P
-skip48:
-                  add        esi,6            // Changed 4 to 6 on these 2
-                  add        ebx,6            // lines.  Glenn R-P 20070717
-
-                  dec        ecx
-                  jnz        secondloop48
-
-end48:
-                  emms
-               }
-            }
-            else /* mmx _not supported - Use modified C routine */
-            {
-               register unsigned int incr1, initial_val, final_val;
-               png_size_t pixel_bytes;
-               png_uint_32 i;
-               register int disp = png_pass_inc[png_ptr->pass];
-
-               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-                  pixel_bytes;
-               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
-               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-               final_val = png_ptr->width*pixel_bytes;
-               incr1 = (disp)*pixel_bytes;
-               for (i = initial_val; i < final_val; i += incr1)
-               {
-                  png_memcpy(dstptr, srcptr, pixel_bytes);
-                  srcptr += incr1;
-                  dstptr += incr1;
-               }
-            } /* end of else */
-
-            break;
-         }       // end 48 bpp
-
-         default:
-         {
-            png_bytep sptr;
-            png_bytep dp;
-            png_size_t pixel_bytes;
-            unsigned int i;
-            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
-            register unsigned int incr1, initial_val, final_val;
-
-            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
-            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
-               pixel_bytes;
-            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
-            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
-            final_val = png_ptr->width*pixel_bytes;
-            incr1 = (disp)*pixel_bytes;
-            for (i = initial_val; i < final_val; i += incr1)
-            {
-               png_memcpy(dp, sptr, pixel_bytes);
-               sptr += incr1;
-               dp += incr1;
-            }
-            break;
-         }
-      } /* end switch (png_ptr->row_info.pixel_depth) */
-   } /* end if (non-trivial mask) */
-
-} /* end png_combine_row() */
-
-
-#if defined(PNG_READ_INTERLACING_SUPPORTED)
-
-void /* PRIVATE */
-png_do_read_interlace(png_structp png_ptr)
-{
-   png_row_infop row_info = &(png_ptr->row_info);
-   png_bytep row = png_ptr->row_buf + 1;
-   int pass = png_ptr->pass;
-   png_uint_32 transformations = png_ptr->transformations;
-
-   png_debug(1,"in png_do_read_interlace\n");
-
-   if (mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-   if (row != NULL && row_info != NULL)
-   {
-      png_uint_32 final_width;
-
-      final_width = row_info->width * png_pass_inc[pass];
-
-      switch (row_info->pixel_depth)
-      {
-         case 1:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_byte v;
-            png_uint_32 i;
-            int j;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 3);
-            dp = row + (png_size_t)((final_width - 1) >> 3);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (int)((row_info->width + 7) & 7);
-               dshift = (int)((final_width + 7) & 7);
-               s_start = 7;
-               s_end = 0;
-               s_inc = -1;
-            }
-            else
-#endif
-            {
-               sshift = 7 - (int)((row_info->width + 7) & 7);
-               dshift = 7 - (int)((final_width + 7) & 7);
-               s_start = 0;
-               s_end = 7;
-               s_inc = 1;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               v = (png_byte)((*sp >> sshift) & 0x1);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 2:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 2);
-            dp = row + (png_size_t)((final_width - 1) >> 2);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
-               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
-               s_start = 6;
-               s_end = 0;
-               s_inc = -2;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
-               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
-               s_start = 0;
-               s_end = 6;
-               s_inc = 2;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0x3);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         case 4:
-         {
-            png_bytep sp, dp;
-            int sshift, dshift;
-            int s_start, s_end, s_inc;
-            png_uint_32 i;
-
-            sp = row + (png_size_t)((row_info->width - 1) >> 1);
-            dp = row + (png_size_t)((final_width - 1) >> 1);
-#if defined(PNG_READ_PACKSWAP_SUPPORTED)
-            if (transformations & PNG_PACKSWAP)
-            {
-               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
-               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
-               s_start = 4;
-               s_end = 0;
-               s_inc = -4;
-            }
-            else
-#endif
-            {
-               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
-               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
-               s_start = 0;
-               s_end = 4;
-               s_inc = 4;
-            }
-
-            for (i = row_info->width; i; i--)
-            {
-               png_byte v;
-               int j;
-
-               v = (png_byte)((*sp >> sshift) & 0xf);
-               for (j = 0; j < png_pass_inc[pass]; j++)
-               {
-                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
-                  *dp |= (png_byte)(v << dshift);
-                  if (dshift == s_end)
-                  {
-                     dshift = s_start;
-                     dp--;
-                  }
-                  else
-                     dshift += s_inc;
-               }
-               if (sshift == s_end)
-               {
-                  sshift = s_start;
-                  sp--;
-               }
-               else
-                  sshift += s_inc;
-            }
-            break;
-         }
-
-         default:         // This is the place where the routine is modified
-         {
-            static PNG_CONST __int64 const4 = 0x0000000000FFFFFF;
-            // static PNG_CONST __int64 const5 = 0x000000FFFFFF0000;  // unused...
-            static PNG_CONST __int64 const6 = 0x00000000000000FF;
-            png_bytep sptr, dp;
-            png_uint_32 i;
-            png_size_t pixel_bytes;
-            int width = row_info->width;
-
-            pixel_bytes = (row_info->pixel_depth >> 3);
-
-            sptr = row + (width - 1) * pixel_bytes;
-            dp = row + (final_width - 1) * pixel_bytes;
-            // New code by Nirav Chhatrapati - Intel Corporation
-            // sign fix by GRR
-            // NOTE:  there is NO MMX code for 48-bit and 64-bit images
-
-            // use MMX routine if machine supports it
-#if !defined(PNG_1_0_X)
-            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
-                /* && mmx_supported */ )
-#else
-            if (mmx_supported)
-#endif
-            {
-               if (pixel_bytes == 3)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) - 8;
-                     if (width_mmx < 0)
-                         width_mmx = 0;
-                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 3
-                           sub edi, 9
-loop_pass4:
-                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
-                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
-                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
-                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
-                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
-                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
-                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
-                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
-                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
-                           movq [edi], mm0     ; move quad to memory
-                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
-                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
-                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
-                           movd [edi+8], mm6   ; move double to memory
-                           sub esi, 6
-                           sub edi, 12
-                           sub ecx, 2
-                           jnz loop_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx*3;
-                     dp -= width_mmx*6;
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-
-                        png_memcpy(v, sptr, 3);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           png_memcpy(dp, v, 3);
-                           dp -= 3;
-                        }
-                        sptr -= 3;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     _asm
-                     {
-                        mov esi, sptr
-                        mov edi, dp
-                        mov ecx, width
-                        sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
-loop_pass2:
-                        movd mm0, [esi]     ; X X X X X v2 v1 v0
-                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
-                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
-                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
-                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
-                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
-                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
-                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
-                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
-                        movq [edi+4], mm0   ; move to memory
-                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
-                        movd [edi], mm0     ; move to memory
-                        sub esi, 3
-                        sub edi, 12
-                        dec ecx
-                        jnz loop_pass2
-                        EMMS
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     _asm
-                     {
-                        mov esi, sptr
-                        mov edi, dp
-                        mov ecx, width
-                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
-loop_pass0:
-                        movd mm0, [esi]     ; X X X X X v2 v1 v0
-                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
-                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
-                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
-                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
-                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
-                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
-                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
-                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
-                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
-                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
-                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
-                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
-                        movq [edi+16] , mm4
-                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
-                        movq [edi+8] , mm3
-                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
-                        sub esi, 3
-                        movq [edi], mm0
-                        sub edi, 24
-                        //sub esi, 3
-                        dec ecx
-                        jnz loop_pass0
-                        EMMS
-                     }
-                  }
-               } /* end of pixel_bytes == 3 */
-
-               else if (pixel_bytes == 1)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 3) << 3);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub edi, 15
-                           sub esi, 7
-loop1_pass4:
-                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
-                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
-                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
-                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
-                           punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
-                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
-                           sub esi, 8
-                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
-                           //sub esi, 4
-                           sub edi, 16
-                           sub ecx, 8
-                           jnz loop1_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*2;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        sptr --;
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub edi, 15
-                           sub esi, 3
-loop1_pass2:
-                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
-                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
-                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
-                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
-                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
-                           movq [edi], mm0     ; move to memory v2 and v3
-                           sub esi, 4
-                           movq [edi+8], mm1   ; move to memory v1     and v0
-                           sub edi, 16
-                           sub ecx, 4
-                           jnz loop1_pass2
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*4;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           *dp-- = *sptr;
-                        }
-                        sptr --;
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     int width_mmx = ((width >> 2) << 2);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub edi, 31
-                           sub esi, 3
-loop1_pass0:
-                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
-                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
-                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
-                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
-                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
-                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
-                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
-                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
-                           movq [edi], mm0     ; move to memory v3
-                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
-                           movq [edi+8], mm3   ; move to memory v2
-                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
-                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
-                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
-                           movq [edi+16], mm2  ; move to memory v1
-                           movq [edi+24], mm4  ; move to memory v0
-                           sub esi, 4
-                           sub edi, 32
-                           sub ecx, 4
-                           jnz loop1_pass0
-                           EMMS
-                        }
-                     }
-
-                     sptr -= width_mmx;
-                     dp -= width_mmx*8;
-                     for (i = width; i; i--)
-                     {
-                        int j;
-
-                       /* I simplified this part in version 1.0.4e
-                        * here and in several other instances where
-                        * pixel_bytes == 1  -- GR-P
-                        *
-                        * Original code:
-                        *
-                        * png_byte v[8];
-                        * png_memcpy(v, sptr, pixel_bytes);
-                        * for (j = 0; j < png_pass_inc[pass]; j++)
-                        * {
-                        *    png_memcpy(dp, v, pixel_bytes);
-                        *    dp -= pixel_bytes;
-                        * }
-                        * sptr -= pixel_bytes;
-                        *
-                        * Replacement code is in the next three lines:
-                        */
-
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                           *dp-- = *sptr;
-                        sptr--;
-                     }
-                  }
-               } /* end of pixel_bytes == 1 */
-
-               else if (pixel_bytes == 2)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 2
-                           sub edi, 6
-loop2_pass4:
-                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-                           sub esi, 4
-                           movq [edi], mm0
-                           sub edi, 8
-                           sub ecx, 2
-                           jnz loop2_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*2 - 2);            // sign fixed
-                     dp -= (width_mmx*4 - 2);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 2
-                           sub edi, 14
-loop2_pass2:
-                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
-                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
-                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
-                           movq [edi], mm0
-                           sub esi, 4
-                           movq [edi + 8], mm1
-                           //sub esi, 4
-                           sub edi, 16
-                           sub ecx, 2
-                           jnz loop2_pass2
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*2 - 2);            // sign fixed
-                     dp -= (width_mmx*8 - 2);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     int width_mmx = ((width >> 1) << 1);
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 2
-                           sub edi, 30
-loop2_pass0:
-                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
-                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
-                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
-                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
-                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
-                           movq [edi], mm0
-                           movq [edi + 8], mm0
-                           movq [edi + 16], mm1
-                           movq [edi + 24], mm1
-                           sub esi, 4
-                           sub edi, 32
-                           sub ecx, 2
-                           jnz loop2_pass0
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*2 - 2);            // sign fixed
-                     dp -= (width_mmx*16 - 2);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 2;
-                        png_memcpy(v, sptr, 2);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 2;
-                           png_memcpy(dp, v, 2);
-                        }
-                     }
-                  }
-               } /* end of pixel_bytes == 2 */
-
-               else if (pixel_bytes == 4)
-               {
-                  if (((pass == 4) || (pass == 5)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 4
-                           sub edi, 12
-loop4_pass4:
-                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
-                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
-                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
-                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
-                           movq [edi], mm0
-                           sub esi, 8
-                           movq [edi + 8], mm1
-                           sub edi, 16
-                           sub ecx, 2
-                           jnz loop4_pass4
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*4 - 4);          // sign fixed
-                     dp -= (width_mmx*8 - 4);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-                  else if (((pass == 2) || (pass == 3)) && width)
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 4
-                           sub edi, 28
-loop4_pass2:
-                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
-                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
-                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
-                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
-                           movq [edi], mm0
-                           movq [edi + 8], mm0
-                           movq [edi+16], mm1
-                           movq [edi + 24], mm1
-                           sub esi, 8
-                           sub edi, 32
-                           sub ecx, 2
-                           jnz loop4_pass2
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*4 - 4);            // sign fixed
-                     dp -= (width_mmx*16 - 4);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-                  else if (width) /* && ((pass == 0) || (pass == 1))) */
-                  {
-                     int width_mmx = ((width >> 1) << 1) ;
-                     width -= width_mmx;
-                     if (width_mmx)
-                     {
-                        _asm
-                        {
-                           mov esi, sptr
-                           mov edi, dp
-                           mov ecx, width_mmx
-                           sub esi, 4
-                           sub edi, 60
-loop4_pass0:
-                           movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
-                           movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
-                           punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
-                           punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
-                           movq [edi], mm0
-                           movq [edi + 8], mm0
-                           movq [edi + 16], mm0
-                           movq [edi + 24], mm0
-                           movq [edi+32], mm1
-                           movq [edi + 40], mm1
-                           movq [edi+ 48], mm1
-                           sub esi, 8
-                           movq [edi + 56], mm1
-                           sub edi, 64
-                           sub ecx, 2
-                           jnz loop4_pass0
-                           EMMS
-                        }
-                     }
-
-                     sptr -= (width_mmx*4 - 4);            // sign fixed
-                     dp -= (width_mmx*32 - 4);            // sign fixed
-                     for (i = width; i; i--)
-                     {
-                        png_byte v[8];
-                        int j;
-                        sptr -= 4;
-                        png_memcpy(v, sptr, 4);
-                        for (j = 0; j < png_pass_inc[pass]; j++)
-                        {
-                           dp -= 4;
-                           png_memcpy(dp, v, 4);
-                        }
-                     }
-                  }
-
-               } /* end of pixel_bytes == 4 */
-
-               else if (pixel_bytes == 6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, 6);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, 6);
-                        dp -= 6;
-                     }
-                     sptr -= 6;
-                  }
-               } /* end of pixel_bytes == 6 */
-
-               else
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr-= pixel_bytes;
-                  }
-               }
-            } /* end of mmx_supported */
-
-            else /* MMX not supported:  use modified C code - takes advantage
-                  * of inlining of memcpy for a constant */
-            {
-               if (pixel_bytes == 1)
-               {
-                  for (i = width; i; i--)
-                  {
-                     int j;
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                        *dp-- = *sptr;
-                     sptr--;
-                  }
-               }
-               else if (pixel_bytes == 3)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else if (pixel_bytes == 2)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else if (pixel_bytes == 4)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else if (pixel_bytes == 6)
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-               else
-               {
-                  for (i = width; i; i--)
-                  {
-                     png_byte v[8];
-                     int j;
-                     png_memcpy(v, sptr, pixel_bytes);
-                     for (j = 0; j < png_pass_inc[pass]; j++)
-                     {
-                        png_memcpy(dp, v, pixel_bytes);
-                        dp -= pixel_bytes;
-                     }
-                     sptr -= pixel_bytes;
-                  }
-               }
-
-            } /* end of MMX not supported */
-            break;
-         }
-      } /* end switch (row_info->pixel_depth) */
-
-      row_info->width = final_width;
-
-      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
-   }
-
-}
-
-#endif /* PNG_READ_INTERLACING_SUPPORTED */
-
-
-// These global constants are declared
-// here to ensure alignment on 8-byte boundaries.
-  union uAll {
-     __int64 use;
-     double  double_align;
-     long long long_long_align;
-  } ;
-  static PNG_CONST union uAll LBCarryMask = {0x0101010101010101},
-                              HBClearMask = {0x7f7f7f7f7f7f7f7f};
-
-// Optimized code for PNG Average filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
-                            , png_bytep prev_row)
-{
-  // These variables are declared
-  // here to ensure alignment on 8-byte boundaries.
-  union uAll ActiveMask, ShiftBpp, ShiftRem;
-
-   int bpp;
-   png_uint_32 FullLength;
-   png_uint_32 MMXLength;
-   //png_uint_32 len;
-   int diff;
-
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   FullLength  = row_info->rowbytes; // # of bytes to filter
-   _asm {
-         // Init address pointers and offset
-         mov edi, row          // edi ==> Avg(x)
-         xor ebx, ebx          // ebx ==> x
-         mov edx, edi
-         mov esi, prev_row           // esi ==> Prior(x)
-         sub edx, bpp          // edx ==> Raw(x-bpp)
-
-         xor eax, eax
-         // Compute the Raw value for the first bpp bytes
-         //    Raw(x) = Avg(x) + (Prior(x)/2)
-davgrlp:
-         mov al, [esi + ebx]   // Load al with Prior(x)
-         inc ebx
-         shr al, 1             // divide by 2
-         add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
-         cmp ebx, bpp
-         mov [edi+ebx-1], al    // Write back Raw(x);
-                            // mov does not affect flags; -1 to offset inc ebx
-         jb davgrlp
-         // get # of bytes to alignment
-         mov diff, edi         // take start of row
-         add diff, ebx         // add bpp
-         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
-         and diff, 0xfffffff8  // mask to alignment boundary
-         sub diff, edi         // subtract from start ==> value ebx at alignment
-         jz davggo
-         // fix alignment
-         // Compute the Raw value for the bytes upto the alignment boundary
-         //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-         xor ecx, ecx
-davglp1:
-         xor eax, eax
-         mov cl, [esi + ebx]        // load cl with Prior(x)
-         mov al, [edx + ebx]  // load al with Raw(x-bpp)
-         add ax, cx
-         inc ebx
-         shr ax, 1            // divide by 2
-         add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
-         cmp ebx, diff              // Check if at alignment boundary
-         mov [edi+ebx-1], al        // Write back Raw(x);
-                            // mov does not affect flags; -1 to offset inc ebx
-         jb davglp1               // Repeat until at alignment boundary
-davggo:
-         mov eax, FullLength
-         mov ecx, eax
-         sub eax, ebx          // subtract alignment fix
-         and eax, 0x00000007   // calc bytes over mult of 8
-         sub ecx, eax          // drop over bytes from original length
-         mov MMXLength, ecx
-   } // end _asm block
-   // Now do the math for the rest of the row
-   switch ( bpp )
-   {
-      case 3:
-      {
-         ActiveMask.use  = 0x0000000000ffffff;
-         ShiftBpp.use = 24;    // == 3 * 8
-         ShiftRem.use = 40;    // == 64 - 24
-         _asm {
-            // Re-init address pointers and offset
-            movq mm7, ActiveMask
-            mov ebx, diff      // ebx ==> x = offset to alignment boundary
-            movq mm5, LBCarryMask
-            mov edi, row       // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov esi, prev_row        // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                               // (we correct position in loop below)
-davg3lp:
-            movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
-            // Add (Prev_row/2) to Average
-            movq mm3, mm5
-            psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
-            movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
-            movq mm6, mm7
-            pand mm3, mm1      // get lsb for each prev_row byte
-            psrlq mm1, 1       // divide prev_row bytes by 2
-            pand  mm1, mm4     // clear invalid bit 7 of each byte
-            paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3      // now use mm1 for getting LBCarrys
-            pand mm1, mm2      // get LBCarrys for each byte where both
-                               // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1       // divide raw bytes by 2
-            pand  mm2, mm4     // clear invalid bit 7 of each byte
-            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
-            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
-                               //  byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-            movq mm1, mm3        // now use mm1 for getting LBCarrys
-            pand mm1, mm2      // get LBCarrys for each byte where both
-                               // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1       // divide raw bytes by 2
-            pand  mm2, mm4     // clear invalid bit 7 of each byte
-            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
-                               //  byte
-
-            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
-                                 // bytes
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-                              // Data only needs to be shifted once here to
-                              // get the correct x-bpp offset.
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
-            add ebx, 8
-            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
-                              // byte
-
-            // Now ready to write back to memory
-            movq [edi + ebx - 8], mm0
-            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
-            cmp ebx, MMXLength
-            movq mm2, mm0     // mov updated Raw(x) to mm2
-            jb davg3lp
-         } // end _asm block
-      }
-      break;
-
-      case 6:
-      case 4:
-      case 7:
-      case 5:
-      {
-         ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
-                                                // appropriate inactive bytes
-         ShiftBpp.use = bpp << 3;
-         ShiftRem.use = 64 - ShiftBpp.use;
-         _asm {
-            movq mm4, HBClearMask
-            // Re-init address pointers and offset
-            mov ebx, diff       // ebx ==> x = offset to alignment boundary
-            // Load ActiveMask and clear all bytes except for 1st active group
-            movq mm7, ActiveMask
-            mov edi, row         // edi ==> Avg(x)
-            psrlq mm7, ShiftRem
-            mov esi, prev_row    // esi ==> Prior(x)
-            movq mm6, mm7
-            movq mm5, LBCarryMask
-            psllq mm6, ShiftBpp  // Create mask for 2nd active group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                                 // (we correct position in loop below)
-davg4lp:
-            movq mm0, [edi + ebx]
-            psrlq mm2, ShiftRem  // shift data to position correctly
-            movq mm1, [esi + ebx]
-            // Add (Prev_row/2) to Average
-            movq mm3, mm5
-            pand mm3, mm1     // get lsb for each prev_row byte
-            psrlq mm1, 1      // divide prev_row bytes by 2
-            pand  mm1, mm4    // clear invalid bit 7 of each byte
-            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
-            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
-                              // byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm2, mm0     // mov updated Raws to mm2
-            psllq mm2, ShiftBpp // shift data to position correctly
-            add ebx, 8
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
-                              // byte
-            cmp ebx, MMXLength
-            // Now ready to write back to memory
-            movq [edi + ebx - 8], mm0
-            // Prep Raw(x-bpp) for next loop
-            movq mm2, mm0     // mov updated Raws to mm2
-            jb davg4lp
-         } // end _asm block
-      }
-      break;
-      case 2:
-      {
-         ActiveMask.use  = 0x000000000000ffff;
-         ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
-         ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
-         _asm {
-            // Load ActiveMask
-            movq mm7, ActiveMask
-            // Re-init address pointers and offset
-            mov ebx, diff     // ebx ==> x = offset to alignment boundary
-            movq mm5, LBCarryMask
-            mov edi, row      // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov esi, prev_row  // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                              // (we correct position in loop below)
-davg2lp:
-            movq mm0, [edi + ebx]
-            psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
-            movq mm1, [esi + ebx]
-            // Add (Prev_row/2) to Average
-            movq mm3, mm5
-            pand mm3, mm1     // get lsb for each prev_row byte
-            psrlq mm1, 1      // divide prev_row bytes by 2
-            pand  mm1, mm4    // clear invalid bit 7 of each byte
-            movq mm6, mm7
-            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
-            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
-            movq mm1, mm3     // now use mm1 for getting LBCarrys
-            pand mm1, mm2     // get LBCarrys for each byte where both
-                              // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1      // divide raw bytes by 2
-            pand  mm2, mm4    // clear invalid bit 7 of each byte
-            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
-            movq mm2, mm0       // mov updated Raws to mm2
-            psllq mm2, ShiftBpp // shift data to position correctly
-            movq mm1, mm3       // now use mm1 for getting LBCarrys
-            pand mm1, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
-            // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
-            movq mm2, mm0       // mov updated Raws to mm2
-            psllq mm2, ShiftBpp // shift data to position correctly
-                                // Data only needs to be shifted once here to
-                                // get the correct x-bpp offset.
-            movq mm1, mm3       // now use mm1 for getting LBCarrys
-            pand mm1, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
-            // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
-            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
-            movq mm2, mm0        // mov updated Raws to mm2
-            psllq mm2, ShiftBpp  // shift data to position correctly
-                                 // Data only needs to be shifted once here to
-                                 // get the correct x-bpp offset.
-            add ebx, 8
-            movq mm1, mm3    // now use mm1 for getting LBCarrys
-            pand mm1, mm2    // get LBCarrys for each byte where both
-                             // lsb's were == 1 (Only valid for active group)
-            psrlq mm2, 1     // divide raw bytes by 2
-            pand  mm2, mm4   // clear invalid bit 7 of each byte
-            paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
-            pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
-            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
-
-            cmp ebx, MMXLength
-            // Now ready to write back to memory
-            movq [edi + ebx - 8], mm0
-            // Prep Raw(x-bpp) for next loop
-            movq mm2, mm0    // mov updated Raws to mm2
-            jb davg2lp
-        } // end _asm block
-      }
-      break;
-
-      case 1:                 // bpp == 1
-      {
-         _asm {
-            // Re-init address pointers and offset
-            mov ebx, diff     // ebx ==> x = offset to alignment boundary
-            mov edi, row      // edi ==> Avg(x)
-            cmp ebx, FullLength  // Test if offset at end of array
-            jnb davg1end
-            // Do Paeth decode for remaining bytes
-            mov esi, prev_row    // esi ==> Prior(x)
-            mov edx, edi
-            xor ecx, ecx         // zero ecx before using cl & cx in loop below
-            sub edx, bpp         // edx ==> Raw(x-bpp)
-davg1lp:
-            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-            xor eax, eax
-            mov cl, [esi + ebx]  // load cl with Prior(x)
-            mov al, [edx + ebx]  // load al with Raw(x-bpp)
-            add ax, cx
-            inc ebx
-            shr ax, 1            // divide by 2
-            add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
-            cmp ebx, FullLength  // Check if at end of array
-            mov [edi+ebx-1], al  // Write back Raw(x);
-                         // mov does not affect flags; -1 to offset inc ebx
-            jb davg1lp
-davg1end:
-         } // end _asm block
-      }
-      return;
-
-      case 8:             // bpp == 8
-      {
-         _asm {
-            // Re-init address pointers and offset
-            mov ebx, diff           // ebx ==> x = offset to alignment boundary
-            movq mm5, LBCarryMask
-            mov edi, row            // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov esi, prev_row       // esi ==> Prior(x)
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
-                                // (NO NEED to correct position in loop below)
-davg8lp:
-            movq mm0, [edi + ebx]
-            movq mm3, mm5
-            movq mm1, [esi + ebx]
-            add ebx, 8
-            pand mm3, mm1       // get lsb for each prev_row byte
-            psrlq mm1, 1        // divide prev_row bytes by 2
-            pand mm3, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm1, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm3      // add LBCarrys to Avg for each byte
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
-            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm0
-            movq mm2, mm0       // reuse as Raw(x-bpp)
-            jb davg8lp
-        } // end _asm block
-      }
-      break;
-      default:                  // bpp greater than 8
-      {
-        _asm {
-            movq mm5, LBCarryMask
-            // Re-init address pointers and offset
-            mov ebx, diff       // ebx ==> x = offset to alignment boundary
-            mov edi, row        // edi ==> Avg(x)
-            movq mm4, HBClearMask
-            mov edx, edi
-            mov esi, prev_row   // esi ==> Prior(x)
-            sub edx, bpp        // edx ==> Raw(x-bpp)
-davgAlp:
-            movq mm0, [edi + ebx]
-            movq mm3, mm5
-            movq mm1, [esi + ebx]
-            pand mm3, mm1       // get lsb for each prev_row byte
-            movq mm2, [edx + ebx]
-            psrlq mm1, 1        // divide prev_row bytes by 2
-            pand mm3, mm2       // get LBCarrys for each byte where both
-                                // lsb's were == 1
-            psrlq mm2, 1        // divide raw bytes by 2
-            pand  mm1, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm3      // add LBCarrys to Avg for each byte
-            pand  mm2, mm4      // clear invalid bit 7 of each byte
-            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
-            add ebx, 8
-            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm0
-            jb davgAlp
-        } // end _asm block
-      }
-      break;
-   }                         // end switch ( bpp )
-
-   _asm {
-         // MMX acceleration complete now do clean-up
-         // Check if any remaining bytes left to decode
-         mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
-         mov edi, row          // edi ==> Avg(x)
-         cmp ebx, FullLength   // Test if offset at end of array
-         jnb davgend
-         // Do Paeth decode for remaining bytes
-         mov esi, prev_row     // esi ==> Prior(x)
-         mov edx, edi
-         xor ecx, ecx          // zero ecx before using cl & cx in loop below
-         sub edx, bpp          // edx ==> Raw(x-bpp)
-davglp2:
-         // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
-         xor eax, eax
-         mov cl, [esi + ebx]   // load cl with Prior(x)
-         mov al, [edx + ebx]   // load al with Raw(x-bpp)
-         add ax, cx
-         inc ebx
-         shr ax, 1              // divide by 2
-         add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
-         cmp ebx, FullLength    // Check if at end of array
-         mov [edi+ebx-1], al    // Write back Raw(x);
-                          // mov does not affect flags; -1 to offset inc ebx
-         jb davglp2
-davgend:
-         emms             // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-// Optimized code for PNG Paeth filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
-                              png_bytep prev_row)
-{
-  // These variables are declared
-  // here to ensure alignment on 8-byte boundaries.
-  union uAll  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
-
-   png_uint_32 FullLength;
-   png_uint_32 MMXLength;
-   //png_uint_32 len;
-   int bpp;
-   int diff;
-   //int ptemp;
-   int patemp, pbtemp, pctemp;
-
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   FullLength  = row_info->rowbytes; // # of bytes to filter
-   _asm
-   {
-         xor ebx, ebx        // ebx ==> x offset
-         mov edi, row
-         xor edx, edx        // edx ==> x-bpp offset
-         mov esi, prev_row
-         xor eax, eax
-
-         // Compute the Raw value for the first bpp bytes
-         // Note: the formula works out to be always
-         //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
-dpthrlp:
-         mov al, [edi + ebx]
-         add al, [esi + ebx]
-         inc ebx
-         cmp ebx, bpp
-         mov [edi + ebx - 1], al
-         jb dpthrlp
-         // get # of bytes to alignment
-         mov diff, edi         // take start of row
-         add diff, ebx         // add bpp
-         xor ecx, ecx
-         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
-         and diff, 0xfffffff8  // mask to alignment boundary
-         sub diff, edi         // subtract from start ==> value ebx at alignment
-         jz dpthgo
-         // fix alignment
-dpthlp1:
-         xor eax, eax
-         // pav = p - a = (a + b - c) - a = b - c
-         mov al, [esi + ebx]   // load Prior(x) into al
-         mov cl, [esi + edx]   // load Prior(x-bpp) into cl
-         sub eax, ecx          // subtract Prior(x-bpp)
-         mov patemp, eax       // Save pav for later use
-         xor eax, eax
-         // pbv = p - b = (a + b - c) - b = a - c
-         mov al, [edi + edx]   // load Raw(x-bpp) into al
-         sub eax, ecx          // subtract Prior(x-bpp)
-         mov ecx, eax
-         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-         add eax, patemp       // pcv = pav + pbv
-         // pc = abs(pcv)
-         test eax, 0x80000000
-         jz dpthpca
-         neg eax               // reverse sign of neg values
-dpthpca:
-         mov pctemp, eax       // save pc for later use
-         // pb = abs(pbv)
-         test ecx, 0x80000000
-         jz dpthpba
-         neg ecx               // reverse sign of neg values
-dpthpba:
-         mov pbtemp, ecx       // save pb for later use
-         // pa = abs(pav)
-         mov eax, patemp
-         test eax, 0x80000000
-         jz dpthpaa
-         neg eax               // reverse sign of neg values
-dpthpaa:
-         mov patemp, eax       // save pa for later use
-         // test if pa <= pb
-         cmp eax, ecx
-         jna dpthabb
-         // pa > pb; now test if pb <= pc
-         cmp ecx, pctemp
-         jna dpthbbc
-         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth
-dpthbbc:
-         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-         mov cl, [esi + ebx]   // load Prior(x) into cl
-         jmp dpthpaeth
-dpthabb:
-         // pa <= pb; now test if pa <= pc
-         cmp eax, pctemp
-         jna dpthabc
-         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth
-dpthabc:
-         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
-dpthpaeth:
-         inc ebx
-         inc edx
-         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-         add [edi + ebx - 1], cl
-         cmp ebx, diff
-         jb dpthlp1
-dpthgo:
-         mov ecx, FullLength
-         mov eax, ecx
-         sub eax, ebx          // subtract alignment fix
-         and eax, 0x00000007   // calc bytes over mult of 8
-         sub ecx, eax          // drop over bytes from original length
-         mov MMXLength, ecx
-   } // end _asm block
-   // Now do the math for the rest of the row
-   switch ( bpp )
-   {
-      case 3:
-      {
-         ActiveMask.use = 0x0000000000ffffff;
-         ActiveMaskEnd.use = 0xffff000000000000;
-         ShiftBpp.use = 24;    // == bpp(3) * 8
-         ShiftRem.use = 40;    // == 64 - 24
-         _asm
-         {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            pxor mm0, mm0
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dpth3lp:
-            psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
-            movq mm2, [esi + ebx]   // load b=Prior(x)
-            punpcklbw mm1, mm0      // Unpack High bytes of a
-            movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
-            punpcklbw mm2, mm0      // Unpack High bytes of b
-            psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpcklbw mm3, mm0      // Unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4       // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5       // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
-            pand mm0, mm6       // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5    // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
-            pand mm7, ActiveMask
-            movq mm2, mm3           // load b=Prior(x) step 1
-            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
-            punpcklbw mm3, mm0      // Unpack High bytes of c
-            movq [edi + ebx], mm7   // write back updated value
-            movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
-            // Now do Paeth for 2nd set of bytes (3-5)
-            psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
-            punpcklbw mm1, mm0      // Unpack High bytes of a
-            pxor mm7, mm7
-            punpcklbw mm2, mm0      // Unpack High bytes of b
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            psubw mm5, mm3
-            psubw mm4, mm3
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
-            //       pav + pbv = pbv + pav
-            movq mm6, mm5
-            paddw mm6, mm4
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
-            pcmpgtw mm7, mm4       // Create mask pav bytes < 0
-            pand mm0, mm5          // Only pbv bytes < 0 in mm0
-            pand mm7, mm4          // Only pav bytes < 0 in mm7
-            psubw mm5, mm0
-            psubw mm4, mm7
-            psubw mm5, mm0
-            psubw mm4, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            movq mm2, [esi + ebx]  // load b=Prior(x)
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, mm2           // load c=Prior(x-bpp) step 1
-            pand mm7, ActiveMask
-            punpckhbw mm2, mm0      // Unpack High bytes of b
-            psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
-             // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
-            psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
-            movq [edi + ebx], mm7   // write back updated value
-            movq mm1, mm7
-            punpckhbw mm3, mm0      // Unpack High bytes of c
-            psllq mm1, ShiftBpp     // Shift bytes
-                                    // Now mm1 will be used as Raw(x-bpp)
-            // Now do Paeth for 3rd, and final, set of bytes (6-7)
-            pxor mm7, mm7
-            punpckhbw mm1, mm0      // Unpack High bytes of a
-            psubw mm4, mm3
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            pxor mm0, mm0
-            paddw mm6, mm5
-
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
-            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
-            pand mm0, mm4       // Only pav bytes < 0 in mm7
-            pand mm7, mm5       // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
-            pand mm0, mm6       // Only pav bytes < 0 in mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5    // pa > pb?
-            movq mm0, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            pandn mm0, mm1
-            pandn mm7, mm4
-            paddw mm0, mm2
-            paddw mm7, mm5
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6    // pab > pc?
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm1, mm1
-            packuswb mm1, mm7
-            // Step ebx to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            pand mm1, ActiveMaskEnd
-            paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
-
-            cmp ebx, MMXLength
-            pxor mm0, mm0              // pxor does not affect flags
-            movq [edi + ebx - 8], mm1  // write back updated value
-                                 // mm1 will be used as Raw(x-bpp) next loop
-                           // mm3 ready to be used as Prior(x-bpp) next loop
-            jb dpth3lp
-         } // end _asm block
-      }
-      break;
-
-      case 6:
-      case 7:
-      case 5:
-      {
-         ActiveMask.use  = 0x00000000ffffffff;
-         ActiveMask2.use = 0xffffffff00000000;
-         ShiftBpp.use = bpp << 3;    // == bpp * 8
-         ShiftRem.use = 64 - ShiftBpp.use;
-         _asm
-         {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-            pxor mm0, mm0
-dpth6lp:
-            // Must shift to position Raw(x-bpp) data
-            psrlq mm1, ShiftRem
-            // Do first set of 4 bytes
-            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
-            punpcklbw mm1, mm0      // Unpack Low bytes of a
-            movq mm2, [esi + ebx]   // load b=Prior(x)
-            punpcklbw mm2, mm0      // Unpack Low bytes of b
-            // Must shift to position Prior(x-bpp) data
-            psrlq mm3, ShiftRem
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpcklbw mm3, mm0      // Unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4       // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5       // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
-            pand mm0, mm6       // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5    // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6    // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
-            pand mm7, ActiveMask
-            psrlq mm3, ShiftRem
-            movq mm2, [esi + ebx]      // load b=Prior(x) step 1
-            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
-            movq mm6, mm2
-            movq [edi + ebx], mm7      // write back updated value
-            movq mm1, [edi+ebx-8]
-            psllq mm6, ShiftBpp
-            movq mm5, mm7
-            psrlq mm1, ShiftRem
-            por mm3, mm6
-            psllq mm5, ShiftBpp
-            punpckhbw mm3, mm0         // Unpack High bytes of c
-            por mm1, mm5
-            // Do second set of 4 bytes
-            punpckhbw mm2, mm0         // Unpack High bytes of b
-            punpckhbw mm1, mm0         // Unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6           // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            // Step ex to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            packuswb mm1, mm7
-            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm1      // write back updated value
-                                // mm1 will be used as Raw(x-bpp) next loop
-            jb dpth6lp
-         } // end _asm block
-      }
-      break;
-
-      case 4:
-      {
-         ActiveMask.use  = 0x00000000ffffffff;
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            pxor mm0, mm0
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]    // Only time should need to read
-                                     //  a=Raw(x-bpp) bytes
-dpth4lp:
-            // Do first set of 4 bytes
-            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
-            punpckhbw mm1, mm0       // Unpack Low bytes of a
-            movq mm2, [esi + ebx]    // load b=Prior(x)
-            punpcklbw mm2, mm0       // Unpack High bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpckhbw mm3, mm0       // Unpack High bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
-            pand mm7, ActiveMask
-            movq mm2, mm3              // load b=Prior(x) step 1
-            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
-            punpcklbw mm3, mm0         // Unpack High bytes of c
-            movq [edi + ebx], mm7      // write back updated value
-            movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
-            // Do second set of 4 bytes
-            punpckhbw mm2, mm0         // Unpack Low bytes of b
-            punpcklbw mm1, mm0         // Unpack Low bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            // Step ex to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            packuswb mm1, mm7
-            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm1      // write back updated value
-                                // mm1 will be used as Raw(x-bpp) next loop
-            jb dpth4lp
-         } // end _asm block
-      }
-      break;
-      case 8:                          // bpp == 8
-      {
-         ActiveMask.use  = 0x00000000ffffffff;
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            mov esi, prev_row
-            pxor mm0, mm0
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]      // Only time should need to read
-                                       //  a=Raw(x-bpp) bytes
-dpth8lp:
-            // Do first set of 4 bytes
-            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
-            punpcklbw mm1, mm0         // Unpack Low bytes of a
-            movq mm2, [esi + ebx]      // load b=Prior(x)
-            punpcklbw mm2, mm0         // Unpack Low bytes of b
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            punpcklbw mm3, mm0         // Unpack Low bytes of c
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            paddw mm7, mm3
-            pxor mm0, mm0
-            packuswb mm7, mm1
-            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
-            pand mm7, ActiveMask
-            movq mm2, [esi + ebx]    // load b=Prior(x)
-            paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
-            punpckhbw mm3, mm0       // Unpack High bytes of c
-            movq [edi + ebx], mm7    // write back updated value
-            movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
-
-            // Do second set of 4 bytes
-            punpckhbw mm2, mm0       // Unpack High bytes of b
-            punpckhbw mm1, mm0       // Unpack High bytes of a
-            // pav = p - a = (a + b - c) - a = b - c
-            movq mm4, mm2
-            // pbv = p - b = (a + b - c) - b = a - c
-            movq mm5, mm1
-            psubw mm4, mm3
-            pxor mm7, mm7
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            movq mm6, mm4
-            psubw mm5, mm3
-            // pa = abs(p-a) = abs(pav)
-            // pb = abs(p-b) = abs(pbv)
-            // pc = abs(p-c) = abs(pcv)
-            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
-            paddw mm6, mm5
-            pand mm0, mm4          // Only pav bytes < 0 in mm7
-            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
-            psubw mm4, mm0
-            pand mm7, mm5          // Only pbv bytes < 0 in mm0
-            psubw mm4, mm0
-            psubw mm5, mm7
-            pxor mm0, mm0
-            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
-            pand mm0, mm6          // Only pav bytes < 0 in mm7
-            psubw mm5, mm7
-            psubw mm6, mm0
-            //  test pa <= pb
-            movq mm7, mm4
-            psubw mm6, mm0
-            pcmpgtw mm7, mm5       // pa > pb?
-            movq mm0, mm7
-            // use mm7 mask to merge pa & pb
-            pand mm5, mm7
-            // use mm0 mask copy to merge a & b
-            pand mm2, mm0
-            pandn mm7, mm4
-            pandn mm0, mm1
-            paddw mm7, mm5
-            paddw mm0, mm2
-            //  test  ((pa <= pb)? pa:pb) <= pc
-            pcmpgtw mm7, mm6       // pab > pc?
-            pxor mm1, mm1
-            pand mm3, mm7
-            pandn mm7, mm0
-            pxor mm1, mm1
-            paddw mm7, mm3
-            pxor mm0, mm0
-            // Step ex to next set of 8 bytes and repeat loop til done
-            add ebx, 8
-            packuswb mm1, mm7
-            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
-            cmp ebx, MMXLength
-            movq [edi + ebx - 8], mm1      // write back updated value
-                            // mm1 will be used as Raw(x-bpp) next loop
-            jb dpth8lp
-         } // end _asm block
-      }
-      break;
-
-      case 1:                // bpp = 1
-      case 2:                // bpp = 2
-      default:               // bpp > 8
-      {
-         _asm {
-            mov ebx, diff
-            cmp ebx, FullLength
-            jnb dpthdend
-            mov edi, row
-            mov esi, prev_row
-            // Do Paeth decode for remaining bytes
-            mov edx, ebx
-            xor ecx, ecx        // zero ecx before using cl & cx in loop below
-            sub edx, bpp        // Set edx = ebx - bpp
-dpthdlp:
-            xor eax, eax
-            // pav = p - a = (a + b - c) - a = b - c
-            mov al, [esi + ebx]        // load Prior(x) into al
-            mov cl, [esi + edx]        // load Prior(x-bpp) into cl
-            sub eax, ecx                 // subtract Prior(x-bpp)
-            mov patemp, eax                 // Save pav for later use
-            xor eax, eax
-            // pbv = p - b = (a + b - c) - b = a - c
-            mov al, [edi + edx]        // load Raw(x-bpp) into al
-            sub eax, ecx                 // subtract Prior(x-bpp)
-            mov ecx, eax
-            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-            add eax, patemp                 // pcv = pav + pbv
-            // pc = abs(pcv)
-            test eax, 0x80000000
-            jz dpthdpca
-            neg eax                     // reverse sign of neg values
-dpthdpca:
-            mov pctemp, eax             // save pc for later use
-            // pb = abs(pbv)
-            test ecx, 0x80000000
-            jz dpthdpba
-            neg ecx                     // reverse sign of neg values
-dpthdpba:
-            mov pbtemp, ecx             // save pb for later use
-            // pa = abs(pav)
-            mov eax, patemp
-            test eax, 0x80000000
-            jz dpthdpaa
-            neg eax                     // reverse sign of neg values
-dpthdpaa:
-            mov patemp, eax             // save pa for later use
-            // test if pa <= pb
-            cmp eax, ecx
-            jna dpthdabb
-            // pa > pb; now test if pb <= pc
-            cmp ecx, pctemp
-            jna dpthdbbc
-            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-            jmp dpthdpaeth
-dpthdbbc:
-            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-            mov cl, [esi + ebx]        // load Prior(x) into cl
-            jmp dpthdpaeth
-dpthdabb:
-            // pa <= pb; now test if pa <= pc
-            cmp eax, pctemp
-            jna dpthdabc
-            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-            jmp dpthdpaeth
-dpthdabc:
-            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-            mov cl, [edi + edx]  // load Raw(x-bpp) into cl
-dpthdpaeth:
-            inc ebx
-            inc edx
-            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-            add [edi + ebx - 1], cl
-            cmp ebx, FullLength
-            jb dpthdlp
-dpthdend:
-         } // end _asm block
-      }
-      return;                   // No need to go further with this one
-   }                         // end switch ( bpp )
-   _asm
-   {
-         // MMX acceleration complete now do clean-up
-         // Check if any remaining bytes left to decode
-         mov ebx, MMXLength
-         cmp ebx, FullLength
-         jnb dpthend
-         mov edi, row
-         mov esi, prev_row
-         // Do Paeth decode for remaining bytes
-         mov edx, ebx
-         xor ecx, ecx         // zero ecx before using cl & cx in loop below
-         sub edx, bpp         // Set edx = ebx - bpp
-dpthlp2:
-         xor eax, eax
-         // pav = p - a = (a + b - c) - a = b - c
-         mov al, [esi + ebx]  // load Prior(x) into al
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         sub eax, ecx         // subtract Prior(x-bpp)
-         mov patemp, eax      // Save pav for later use
-         xor eax, eax
-         // pbv = p - b = (a + b - c) - b = a - c
-         mov al, [edi + edx]  // load Raw(x-bpp) into al
-         sub eax, ecx         // subtract Prior(x-bpp)
-         mov ecx, eax
-         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
-         add eax, patemp      // pcv = pav + pbv
-         // pc = abs(pcv)
-         test eax, 0x80000000
-         jz dpthpca2
-         neg eax              // reverse sign of neg values
-dpthpca2:
-         mov pctemp, eax      // save pc for later use
-         // pb = abs(pbv)
-         test ecx, 0x80000000
-         jz dpthpba2
-         neg ecx              // reverse sign of neg values
-dpthpba2:
-         mov pbtemp, ecx      // save pb for later use
-         // pa = abs(pav)
-         mov eax, patemp
-         test eax, 0x80000000
-         jz dpthpaa2
-         neg eax              // reverse sign of neg values
-dpthpaa2:
-         mov patemp, eax      // save pa for later use
-         // test if pa <= pb
-         cmp eax, ecx
-         jna dpthabb2
-         // pa > pb; now test if pb <= pc
-         cmp ecx, pctemp
-         jna dpthbbc2
-         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth2
-dpthbbc2:
-         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
-         mov cl, [esi + ebx]        // load Prior(x) into cl
-         jmp dpthpaeth2
-dpthabb2:
-         // pa <= pb; now test if pa <= pc
-         cmp eax, pctemp
-         jna dpthabc2
-         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
-         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
-         jmp dpthpaeth2
-dpthabc2:
-         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
-         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
-dpthpaeth2:
-         inc ebx
-         inc edx
-         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
-         add [edi + ebx - 1], cl
-         cmp ebx, FullLength
-         jb dpthlp2
-dpthend:
-         emms             // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-// Optimized code for PNG Sub filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
-{
-  // These variables are declared
-  // here to ensure alignment on 8-byte boundaries.
-  union uAll ActiveMask, ShiftBpp, ShiftRem;
-
-   //int test;
-   int bpp;
-   png_uint_32 FullLength;
-   png_uint_32 MMXLength;
-   int diff;
-
-   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
-   FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
-   _asm {
-        mov edi, row
-        mov esi, edi               // lp = row
-        add edi, bpp               // rp = row + bpp
-        xor eax, eax
-        // get # of bytes to alignment
-        mov diff, edi               // take start of row
-        add diff, 0xf               // add 7 + 8 to incr past
-                                        // alignment boundary
-        xor ebx, ebx
-        and diff, 0xfffffff8        // mask to alignment boundary
-        sub diff, edi               // subtract from start ==> value
-                                        //  ebx at alignment
-        jz dsubgo
-        // fix alignment
-dsublp1:
-        mov al, [esi+ebx]
-        add [edi+ebx], al
-        inc ebx
-        cmp ebx, diff
-        jb dsublp1
-dsubgo:
-        mov ecx, FullLength
-        mov edx, ecx
-        sub edx, ebx                  // subtract alignment fix
-        and edx, 0x00000007           // calc bytes over mult of 8
-        sub ecx, edx                  // drop over bytes from length
-        mov MMXLength, ecx
-   } // end _asm block
-
-   // Now do the math for the rest of the row
-   switch ( bpp )
-   {
-        case 3:
-        {
-         ActiveMask.use  = 0x0000ffffff000000;
-         ShiftBpp.use = 24;       // == 3 * 8
-         ShiftRem.use  = 40;      // == 64 - 24
-         _asm {
-            mov edi, row
-            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
-            mov esi, edi              // lp = row
-            add edi, bpp          // rp = row + bpp
-            movq mm6, mm7
-            mov ebx, diff
-            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
-                                  // byte group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dsub3lp:
-            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
-                          // no need for mask; shift clears inactive bytes
-            // Add 1st active group
-            movq mm0, [edi+ebx]
-            paddb mm0, mm1
-            // Add 2nd active group
-            movq mm1, mm0         // mov updated Raws to mm1
-            psllq mm1, ShiftBpp   // shift data to position correctly
-            pand mm1, mm7         // mask to use only 2nd active group
-            paddb mm0, mm1
-            // Add 3rd active group
-            movq mm1, mm0         // mov updated Raws to mm1
-            psllq mm1, ShiftBpp   // shift data to position correctly
-            pand mm1, mm6         // mask to use only 3rd active group
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0     // Write updated Raws back to array
-            // Prep for doing 1st add at top of loop
-            movq mm1, mm0
-            jb dsub3lp
-         } // end _asm block
-      }
-      break;
-
-      case 1:
-      {
-         // Placed here just in case this is a duplicate of the
-         // non-MMX code for the SUB filter in png_read_filter_row below
-         //
-         //         png_bytep rp;
-         //         png_bytep lp;
-         //         png_uint_32 i;
-         //         bpp = (row_info->pixel_depth + 7) >> 3;
-         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
-         //            i < row_info->rowbytes; i++, rp++, lp++)
-         //      {
-         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
-         //      }
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            cmp ebx, FullLength
-            jnb dsub1end
-            mov esi, edi          // lp = row
-            xor eax, eax
-            add edi, bpp      // rp = row + bpp
-dsub1lp:
-            mov al, [esi+ebx]
-            add [edi+ebx], al
-            inc ebx
-            cmp ebx, FullLength
-            jb dsub1lp
-dsub1end:
-         } // end _asm block
-      }
-      return;
-
-      case 6:
-      case 7:
-      case 4:
-      case 5:
-      {
-         ShiftBpp.use = bpp << 3;
-         ShiftRem.use = 64 - ShiftBpp.use;
-         _asm {
-            mov edi, row
-            mov ebx, diff
-            mov esi, edi               // lp = row
-            add edi, bpp           // rp = row + bpp
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dsub4lp:
-            psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
-                          // no need for mask; shift clears inactive bytes
-            movq mm0, [edi+ebx]
-            paddb mm0, mm1
-            // Add 2nd active group
-            movq mm1, mm0          // mov updated Raws to mm1
-            psllq mm1, ShiftBpp    // shift data to position correctly
-                                   // there is no need for any mask
-                                   // since shift clears inactive bits/bytes
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0
-            movq mm1, mm0          // Prep for doing 1st add at top of loop
-            jb dsub4lp
-         } // end _asm block
-      }
-      break;
-
-      case 2:
-      {
-         ActiveMask.use  = 0x00000000ffff0000;
-         ShiftBpp.use = 16;       // == 2 * 8
-         ShiftRem.use = 48;       // == 64 - 16
-         _asm {
-            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
-            mov ebx, diff
-            movq mm6, mm7
-            mov edi, row
-            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
-                                    //  byte group
-            mov esi, edi            // lp = row
-            movq mm5, mm6
-            add edi, bpp            // rp = row + bpp
-            psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
-                                    //  byte group
-            // PRIME the pump (load the first Raw(x-bpp) data set
-            movq mm1, [edi+ebx-8]
-dsub2lp:
-            // Add 1st active group
-            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
-                                    // no need for mask; shift clears inactive
-                                    //  bytes
-            movq mm0, [edi+ebx]
-            paddb mm0, mm1
-            // Add 2nd active group
-            movq mm1, mm0           // mov updated Raws to mm1
-            psllq mm1, ShiftBpp     // shift data to position correctly
-            pand mm1, mm7           // mask to use only 2nd active group
-            paddb mm0, mm1
-            // Add 3rd active group
-            movq mm1, mm0           // mov updated Raws to mm1
-            psllq mm1, ShiftBpp     // shift data to position correctly
-            pand mm1, mm6           // mask to use only 3rd active group
-            paddb mm0, mm1
-            // Add 4th active group
-            movq mm1, mm0           // mov updated Raws to mm1
-            psllq mm1, ShiftBpp     // shift data to position correctly
-            pand mm1, mm5           // mask to use only 4th active group
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0   // Write updated Raws back to array
-            movq mm1, mm0           // Prep for doing 1st add at top of loop
-            jb dsub2lp
-         } // end _asm block
-      }
-      break;
-      case 8:
-      {
-         _asm {
-            mov edi, row
-            mov ebx, diff
-            mov esi, edi            // lp = row
-            add edi, bpp            // rp = row + bpp
-            mov ecx, MMXLength
-            movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
-                                    // Raw(x-bpp) data set
-            and ecx, 0x0000003f     // calc bytes over mult of 64
-dsub8lp:
-            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
-            paddb mm0, mm7
-            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
-            movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
-                                   // Now mm0 will be used as Raw(x-bpp) for
-                                   // the 2nd group of 8 bytes.  This will be
-                                   // repeated for each group of 8 bytes with
-                                   // the 8th group being used as the Raw(x-bpp)
-                                   // for the 1st group of the next loop.
-            paddb mm1, mm0
-            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
-            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
-            paddb mm2, mm1
-            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
-            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
-            paddb mm3, mm2
-            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
-            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
-            paddb mm4, mm3
-            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
-            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
-            paddb mm5, mm4
-            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
-            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
-            paddb mm6, mm5
-            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
-            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
-            add ebx, 64
-            paddb mm7, mm6
-            cmp ebx, ecx
-            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
-            jb dsub8lp
-            cmp ebx, MMXLength
-            jnb dsub8lt8
-dsub8lpA:
-            movq mm0, [edi+ebx]
-            add ebx, 8
-            paddb mm0, mm7
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
-            movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
-                                    // be the new Raw(x-bpp) for the next loop
-            jb dsub8lpA
-dsub8lt8:
-         } // end _asm block
-      }
-      break;
-
-      default:                // bpp greater than 8 bytes
-      {
-         _asm {
-            mov ebx, diff
-            mov edi, row
-            mov esi, edi           // lp = row
-            add edi, bpp           // rp = row + bpp
-dsubAlp:
-            movq mm0, [edi+ebx]
-            movq mm1, [esi+ebx]
-            add ebx, 8
-            paddb mm0, mm1
-            cmp ebx, MMXLength
-            movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
-                                   //  add ebx
-            jb dsubAlp
-         } // end _asm block
-      }
-      break;
-
-   } // end switch ( bpp )
-
-   _asm {
-        mov ebx, MMXLength
-        mov edi, row
-        cmp ebx, FullLength
-        jnb dsubend
-        mov esi, edi               // lp = row
-        xor eax, eax
-        add edi, bpp               // rp = row + bpp
-dsublp2:
-        mov al, [esi+ebx]
-        add [edi+ebx], al
-        inc ebx
-        cmp ebx, FullLength
-        jb dsublp2
-dsubend:
-        emms             // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-// Optimized code for PNG Up filter decoder
-void /* PRIVATE */
-png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
-   png_bytep prev_row)
-{
-   png_uint_32 len;
-   len  = row_info->rowbytes;       // # of bytes to filter
-   _asm {
-      mov edi, row
-      // get # of bytes to alignment
-      mov ecx, edi
-      xor ebx, ebx
-      add ecx, 0x7
-      xor eax, eax
-      and ecx, 0xfffffff8
-      mov esi, prev_row
-      sub ecx, edi
-      jz dupgo
-      // fix alignment
-duplp1:
-      mov al, [edi+ebx]
-      add al, [esi+ebx]
-      inc ebx
-      cmp ebx, ecx
-      mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
-      jb duplp1
-dupgo:
-      mov ecx, len
-      mov edx, ecx
-      sub edx, ebx                  // subtract alignment fix
-      and edx, 0x0000003f           // calc bytes over mult of 64
-      sub ecx, edx                  // drop over bytes from length
-      // Unrolled loop - use all MMX registers and interleave to reduce
-      // number of branch instructions (loops) and reduce partial stalls
-duploop:
-      movq mm1, [esi+ebx]
-      movq mm0, [edi+ebx]
-      movq mm3, [esi+ebx+8]
-      paddb mm0, mm1
-      movq mm2, [edi+ebx+8]
-      movq [edi+ebx], mm0
-      paddb mm2, mm3
-      movq mm5, [esi+ebx+16]
-      movq [edi+ebx+8], mm2
-      movq mm4, [edi+ebx+16]
-      movq mm7, [esi+ebx+24]
-      paddb mm4, mm5
-      movq mm6, [edi+ebx+24]
-      movq [edi+ebx+16], mm4
-      paddb mm6, mm7
-      movq mm1, [esi+ebx+32]
-      movq [edi+ebx+24], mm6
-      movq mm0, [edi+ebx+32]
-      movq mm3, [esi+ebx+40]
-      paddb mm0, mm1
-      movq mm2, [edi+ebx+40]
-      movq [edi+ebx+32], mm0
-      paddb mm2, mm3
-      movq mm5, [esi+ebx+48]
-      movq [edi+ebx+40], mm2
-      movq mm4, [edi+ebx+48]
-      movq mm7, [esi+ebx+56]
-      paddb mm4, mm5
-      movq mm6, [edi+ebx+56]
-      movq [edi+ebx+48], mm4
-      add ebx, 64
-      paddb mm6, mm7
-      cmp ebx, ecx
-      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
-                                     // -8 to offset add ebx
-      jb duploop
-
-      cmp edx, 0                     // Test for bytes over mult of 64
-      jz dupend
-
-
-      // 2 lines added by lcreeve at netins.net
-      // (mail 11 Jul 98 in png-implement list)
-      cmp edx, 8 //test for less than 8 bytes
-      jb duplt8
-
-
-      add ecx, edx
-      and edx, 0x00000007           // calc bytes over mult of 8
-      sub ecx, edx                  // drop over bytes from length
-      jz duplt8
-      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
-duplpA:
-      movq mm1, [esi+ebx]
-      movq mm0, [edi+ebx]
-      add ebx, 8
-      paddb mm0, mm1
-      cmp ebx, ecx
-      movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
-      jb duplpA
-      cmp edx, 0            // Test for bytes over mult of 8
-      jz dupend
-duplt8:
-      xor eax, eax
-      add ecx, edx          // move over byte count into counter
-      // Loop using x86 registers to update remaining bytes
-duplp2:
-      mov al, [edi + ebx]
-      add al, [esi + ebx]
-      inc ebx
-      cmp ebx, ecx
-      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
-      jb duplp2
-dupend:
-      // Conversion of filtered row completed
-      emms          // End MMX instructions; prep for possible FP instrs.
-   } // end _asm block
-}
-
-
-// Optimized png_read_filter_row routines
-void /* PRIVATE */
-png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
-   row, png_bytep prev_row, int filter)
-{
-#ifdef PNG_DEBUG
-   char filnm[10];
-#endif
-
-   if (mmx_supported == 2) {
-#if !defined(PNG_1_0_X)
-       /* this should have happened in png_init_mmx_flags() already */
-       png_warning(png_ptr, "asm_flags may not have been initialized");
-#endif
-       png_mmx_support();
-   }
-
-#ifdef PNG_DEBUG
-   png_debug(1, "in png_read_filter_row\n");
-   switch (filter)
-   {
-      case 0: png_snprintf(filnm, 10, "none");
-         break;
-#if !defined(PNG_1_0_X)
-      case 1: png_snprintf(filnm, 10, "sub-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
-         break;
-      case 2: png_snprintf(filnm, 10, "up-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
-         break;
-      case 3: png_snprintf(filnm, 10, "avg-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
-         break;
-      case 4: png_snprintf(filnm, 10, "Paeth-%s",
-        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
-         break;
-#else
-      case 1: png_snprintf(filnm, 10, "sub");
-         break;
-      case 2: png_snprintf(filnm, 10, "up");
-         break;
-      case 3: png_snprintf(filnm, 10, "avg");
-         break;
-      case 4: png_snprintf(filnm, 10, "Paeth");
-         break;
-#endif
-      default: png_snprintf(filnm, 10, "unknw");
-         break;
-   }
-   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
-   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
-      (int)((row_info->pixel_depth + 7) >> 3));
-   png_debug1(0,"len=%8d, ", row_info->rowbytes);
-#endif /* PNG_DEBUG */
-
-   switch (filter)
-   {
-      case PNG_FILTER_VALUE_NONE:
-         break;
-
-      case PNG_FILTER_VALUE_SUB:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_sub(row_info, row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_bytep rp = row + bpp;
-            png_bytep lp = row;
-
-            for (i = bpp; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      case PNG_FILTER_VALUE_UP:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_up(row_info, row, prev_row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_uint_32 istop = row_info->rowbytes;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-
-            for (i = 0; i < istop; ++i)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      case PNG_FILTER_VALUE_AVG:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_avg(row_info, row, prev_row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop = row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++) >> 1)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) +
-                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      case PNG_FILTER_VALUE_PAETH:
-      {
-#if !defined(PNG_1_0_X)
-         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
-             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
-             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
-#else
-         if (mmx_supported)
-#endif
-         {
-            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
-         }
-         else
-         {
-            png_uint_32 i;
-            png_bytep rp = row;
-            png_bytep pp = prev_row;
-            png_bytep lp = row;
-            png_bytep cp = prev_row;
-            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
-            png_uint_32 istop=row_info->rowbytes - bpp;
-
-            for (i = 0; i < bpp; i++)
-            {
-               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
-               rp++;
-            }
-
-            for (i = 0; i < istop; i++)   // use leftover rp,pp
-            {
-               int a, b, c, pa, pb, pc, p;
-
-               a = *lp++;
-               b = *pp++;
-               c = *cp++;
-
-               p = b - c;
-               pc = a - c;
-
-#ifdef PNG_USE_ABS
-               pa = abs(p);
-               pb = abs(pc);
-               pc = abs(p + pc);
-#else
-               pa = p < 0 ? -p : p;
-               pb = pc < 0 ? -pc : pc;
-               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
-#endif
-
-               /*
-                  if (pa <= pb && pa <= pc)
-                     p = a;
-                  else if (pb <= pc)
-                     p = b;
-                  else
-                     p = c;
-                */
-
-               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
-
-               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
-               rp++;
-            }
-         }
-         break;
-      }
-
-      default:
-         png_warning(png_ptr, "Ignoring bad row filter type");
-         *row=0;
-         break;
-   }
-}
-
-#endif /* PNG_MMX_CODE_SUPPORTED && PNG_USE_PNGVCRD */
+/* pnggvrd.c was removed from libpng-1.2.20. */
diff --git a/pngwrite.c b/pngwrite.c
index 8d5b98a..c6df1ef 100644
--- a/pngwrite.c
+++ b/pngwrite.c
@@ -454,14 +454,6 @@
    if (png_ptr == NULL)
       return (NULL);
 
-#if !defined(PNG_1_0_X)
-#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
-#ifdef PNG_MMX_CODE_SUPPORTED
-   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
-#endif
-#endif
-#endif /* PNG_1_0_X */
-
    /* added at libpng-1.2.6 */
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
    png_ptr->user_width_max=PNG_USER_WIDTH_MAX;
@@ -670,14 +662,6 @@
    png_ptr->user_height_max=PNG_USER_HEIGHT_MAX;
 #endif
 
-#if !defined(PNG_1_0_X)
-#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
-#ifdef PNG_MMX_CODE_SUPPORTED
-   png_init_mmx_flags(png_ptr);   /* 1.2.0 addition */
-#endif
-#endif
-#endif /* PNG_1_0_X */
-
 #ifdef PNG_SETJMP_SUPPORTED
    /* restore jump buffer */
    png_memcpy(png_ptr->jmpbuf, tmp_jmp, png_sizeof (jmp_buf));
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index 47c57a2..b948607 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -62,8 +62,6 @@
  endif("$ENV{${TEXT}}" STREQUAL "")
 endif(NOT WIN32)
 
-option(PNG_MMX "Use MMX assembler code (x86 only)" ${png_asm_tmp})
-
 # SET LIBNAME
 # msvc does not append 'lib' - do it here to have consistent name
 if(MSVC)
@@ -101,21 +99,8 @@
 # SOME NEEDED DEFINITIONS
 add_definitions(-DZLIB_DLL)
 
-if(PNG_MMX)
- if(MSVC)
-  add_definitions(-DPNG_NO_MODULEDEF -D_CRT_SECURE_NO_DEPRECATE)
-  set(libpng_sources ${libpng_sources}
-          pngvcrd.c
-  )
- else(MSVC)
-  set(libpng_sources ${libpng_sources}
-          pnggccrd.c
-  )
- endif(MSVC)
-else(PNG_MMX)
-  add_definitions(-DLIBPNG_NO_MMX)
-  add_definitions(-DPNG_NO_MMX_CODE)
-endif(PNG_MMX)
+add_definitions(-DLIBPNG_NO_MMX)
+add_definitions(-DPNG_NO_MMX_CODE)
 
 if(PNG_CONSOLE_IO_SUPPORTED)
  add_definitions(-DPNG_CONSOLE_IO_SUPPORTED)
@@ -180,7 +165,7 @@
 
 # SET UP LINKS
 set_target_properties(${PNG_LIB_NAME} PROPERTIES
-#    VERSION 0.${PNGLIB_RELEASE}.1.2.20rc3
+#    VERSION 0.${PNGLIB_RELEASE}.1.2.20rc4
      VERSION 0.${PNGLIB_RELEASE}.0
      SOVERSION 0
      CLEAN_DIRECT_OUTPUT 1)
diff --git a/scripts/libpng-config-head.in b/scripts/libpng-config-head.in
index 0289369..8f8e29b 100755
--- a/scripts/libpng-config-head.in
+++ b/scripts/libpng-config-head.in
@@ -8,7 +8,7 @@
 
 # Modeled after libxml-config.
 
-version=1.2.20rc3
+version=1.2.20rc4
 prefix=""
 libdir=""
 libs=""
diff --git a/scripts/libpng.pc-configure.in b/scripts/libpng.pc-configure.in
index 03e08e2..ab39e20 100644
--- a/scripts/libpng.pc-configure.in
+++ b/scripts/libpng.pc-configure.in
@@ -5,6 +5,6 @@
 
 Name: libpng
 Description: Loads and saves PNG files
-Version: 1.2.20rc3
+Version: 1.2.20rc4
 Libs: -L${libdir} -lpng12
 Cflags: -I${includedir} @LIBPNG_NO_MMX@
diff --git a/scripts/libpng.pc.in b/scripts/libpng.pc.in
index d3e688d..d83e2d8 100644
--- a/scripts/libpng.pc.in
+++ b/scripts/libpng.pc.in
@@ -5,6 +5,6 @@
 
 Name: libpng
 Description: Loads and saves PNG files
-Version: 1.2.20rc3
+Version: 1.2.20rc4
 Libs: -L${libdir} -lpng12
 Cflags: -I${includedir}
diff --git a/scripts/makefile.32sunu b/scripts/makefile.32sunu
index fb4dc45..1255998 100644
--- a/scripts/makefile.32sunu
+++ b/scripts/makefile.32sunu
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.64sunu b/scripts/makefile.64sunu
index 0b8564f..aed69ed 100644
--- a/scripts/makefile.64sunu
+++ b/scripts/makefile.64sunu
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.aix b/scripts/makefile.aix
index 1d31f0d..f32233f 100644
--- a/scripts/makefile.aix
+++ b/scripts/makefile.aix
@@ -20,7 +20,7 @@
 
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 prefix=/usr/local
diff --git a/scripts/makefile.beos b/scripts/makefile.beos
index d595233..15449c4 100644
--- a/scripts/makefile.beos
+++ b/scripts/makefile.beos
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.cygwin b/scripts/makefile.cygwin
index 701e6b1..2812ac2 100644
--- a/scripts/makefile.cygwin
+++ b/scripts/makefile.cygwin
@@ -31,9 +31,6 @@
 
 DESTDIR=
 
-# To disable assembler optimizations, add '-DPNG_NO_MMX_CODE' to
-# $CFLAGS.  To enable, add pnggccrd.o to the dependencies.
-
 CC=gcc
 ifdef MINGW
 MINGW_CCFLAGS=-mno-cygwin -I/usr/include/mingw
@@ -77,7 +74,7 @@
 LIBNAME = libpng12
 PNGMAJ = 0
 CYGDLL = 12
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 SHAREDLIB=cygpng$(CYGDLL).dll
@@ -112,7 +109,7 @@
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o # pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -156,20 +153,6 @@
 all-static: $(STATLIB) pngtest-stat$(EXE)
 all-shared: $(SHAREDLIB) pngtest$(EXE)
 
-# pnggccrd.o: pnggccrd.c png.h pngconf.h
-#	@echo ""
-#	@echo '    You can ignore the "control reaches end of non-void function"'
-#	@echo '    warning and "<variable> defined but not used" warnings:'
-#	@echo ""
-#	$(CC) -c $(CFLAGS) -o $@ $<
-
-# pnggccrd.pic.o:	pnggccrd.c png.h pngconf.h
-#	@echo ""
-#	@echo '    You can ignore the "control reaches end of non-void function"'
-#	@echo '    warning and "<variable> defined but not used" warnings:'
-#	@echo ""
-#	$(CC) -c $(CFLAGS) -DPNG_BUILD_DLL -o $@ $<
-
 $(STATLIB): $(OBJS)
 	ar rc $@ $(OBJS)
 	$(RANLIB) $@
diff --git a/scripts/makefile.darwin b/scripts/makefile.darwin
index f84c070..2a27d1f 100644
--- a/scripts/makefile.darwin
+++ b/scripts/makefile.darwin
@@ -19,7 +19,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.dec b/scripts/makefile.dec
index b18730f..a9d243e 100644
--- a/scripts/makefile.dec
+++ b/scripts/makefile.dec
@@ -5,7 +5,7 @@
 
 # Library name:
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 LIBNAME = libpng12
 
diff --git a/scripts/makefile.elf b/scripts/makefile.elf
index d7f49be..4d8e04c 100644
--- a/scripts/makefile.elf
+++ b/scripts/makefile.elf
@@ -12,7 +12,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.gcmmx b/scripts/makefile.gcmmx
index a99bef5..1bc5215 100644
--- a/scripts/makefile.gcmmx
+++ b/scripts/makefile.gcmmx
@@ -7,8 +7,6 @@
 
 # CAUTION: Do not use this makefile with gcc versions 2.7.2.2 and earlier.
 
-# WARNING: The assembler code in pnggccrd.c may not be thread safe.
-
 # NOTE:  When testing MMX performance on a multitasking system, make sure
 #        there are no floating-point programs (e.g., SETI@Home) running in
 #        the background!  Context switches between MMX and FPU are expensive.
@@ -16,7 +14,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
@@ -98,7 +96,7 @@
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -127,12 +125,6 @@
 	cat scripts/libpng-config-body.in ) > libpng-config
 	chmod +x libpng-config
 
-pnggccrd.o:	pnggccrd.c png.h pngconf.h
-	$(CC) -c $(CFLAGS) -o $@ $*.c
-
-pnggccrd.pic.o:	pnggccrd.c png.h pngconf.h
-	$(CC) -c $(CFLAGS) -fPIC -o $@ pnggccrd.c
-
 $(LIBSO): $(LIBSOMAJ)
 	$(LN_SF) $(LIBSOMAJ) $(LIBSO)
 
diff --git a/scripts/makefile.hp64 b/scripts/makefile.hp64
index 6466612..38c896c 100644
--- a/scripts/makefile.hp64
+++ b/scripts/makefile.hp64
@@ -18,7 +18,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.hpgcc b/scripts/makefile.hpgcc
index 3cfa3e3..34053ce 100644
--- a/scripts/makefile.hpgcc
+++ b/scripts/makefile.hpgcc
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.hpux b/scripts/makefile.hpux
index 543a14d..d92222d 100644
--- a/scripts/makefile.hpux
+++ b/scripts/makefile.hpux
@@ -18,7 +18,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.intel b/scripts/makefile.intel
index a155ddd..b0b523a 100644
--- a/scripts/makefile.intel
+++ b/scripts/makefile.intel
@@ -36,7 +36,7 @@
 
 OBJS=png$(O) pngset$(O) pngget$(O) pngrutil$(O) pngtrans$(O) pngwutil$(O) \
 pngmem$(O) pngpread$(O) pngread$(O) pngerror$(O) pngwrite$(O) \
-pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O) pngvcrd$(O) pnggccrd$(O)
+pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O)
 
 all: test
 
@@ -61,12 +61,6 @@
 pngrutil$(O): png.h pngconf.h
 	$(CC) $(CFLAGS) $*.c $(ERRFILE)
 
-pnggccrd$(O): png.h pngconf.h
-	$(CC) $(CFLAGS) $*.c $(ERRFILE)
-
-pngvcrd$(O): png.h pngconf.h
-	$(CC) $(CFLAGS) $*.c $(ERRFILE)
-
 pngerror$(O): png.h pngconf.h
 	$(CC) $(CFLAGS) $*.c $(ERRFILE)
 
diff --git a/scripts/makefile.linux b/scripts/makefile.linux
index 9ee81f6..2c83c21 100644
--- a/scripts/makefile.linux
+++ b/scripts/makefile.linux
@@ -6,7 +6,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.mingw b/scripts/makefile.mingw
index 09faefd..43963e8 100644
--- a/scripts/makefile.mingw
+++ b/scripts/makefile.mingw
@@ -39,10 +39,6 @@
 
 MKDIR_P=/bin/mkdir -pv
 
-
-# To disable assembler optimizations, add '-DPNG_NO_MMX_CODE' to
-# $CFLAGS.  To enable them, add pnggccrd.o to the dependencies.
-
 # Where "make install" puts libpng*.a, *png*.dll, png.h, and pngconf.h
 ifndef prefix
 prefix=/usr
@@ -78,7 +74,7 @@
 LIBNAME = libpng12
 PNGMAJ = 0
 MINGDLL = 12
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 SHAREDLIB=libpng$(MINGDLL).dll
@@ -109,7 +105,7 @@
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o # pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -282,7 +278,6 @@
 pngwtran.o pngwtran.pic.o:	png.h pngconf.h pngwtran.c
 pngwutil.o pngwutil.pic.o:	png.h pngconf.h pngwutil.c
 pngpread.o pngpread.pic.o:	png.h pngconf.h pngpread.c
-# pnggccrd.o pnggccrd.pic.o:	png.h pngconf.h pnggccrd.c
 
 pngtest.o pngtest.pic.o:	png.h pngconf.h pngtest.c
 
diff --git a/scripts/makefile.ne12bsd b/scripts/makefile.ne12bsd
index 422e516..de9c56a 100644
--- a/scripts/makefile.ne12bsd
+++ b/scripts/makefile.ne12bsd
@@ -14,10 +14,10 @@
 
 LIB=	png12
 SHLIB_MAJOR=	0
-SHLIB_MINOR=	1.2.20rc3
-SRCS=	pnggccrd.c png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
-		pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-		pngwtran.c pngmem.c pngerror.c pngpread.c
+SHLIB_MINOR=	1.2.20rc4
+SRCS=	png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
+	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c
 INCS=	png.h pngconf.h
 MAN=	libpng.3 libpngpf.3 png.5
 
diff --git a/scripts/makefile.netbsd b/scripts/makefile.netbsd
index c1219e2..26be87f 100644
--- a/scripts/makefile.netbsd
+++ b/scripts/makefile.netbsd
@@ -14,10 +14,10 @@
 
 LIB=	png
 SHLIB_MAJOR=	3
-SHLIB_MINOR=	1.2.20rc3
-SRCS=	pnggccrd.c png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
-		pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
-		pngwtran.c pngmem.c pngerror.c pngpread.c
+SHLIB_MINOR=	1.2.20rc4
+SRCS=	png.c pngset.c pngget.c pngrutil.c pngtrans.c pngwutil.c \
+	pngread.c pngrio.c pngwio.c pngwrite.c pngrtran.c \
+	pngwtran.c pngmem.c pngerror.c pngpread.c
 INCS=	png.h pngconf.h
 MAN=	libpng.3 libpngpf.3 png.5
 
diff --git a/scripts/makefile.nommx b/scripts/makefile.nommx
index a2d840b..e1b8c01 100644
--- a/scripts/makefile.nommx
+++ b/scripts/makefile.nommx
@@ -7,7 +7,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.openbsd b/scripts/makefile.openbsd
index 9e85b6f..9125962 100644
--- a/scripts/makefile.openbsd
+++ b/scripts/makefile.openbsd
@@ -8,10 +8,10 @@
 MANDIR= ${PREFIX}/man/cat
 
 SHLIB_MAJOR=	0
-SHLIB_MINOR=	1.2.20rc3
+SHLIB_MINOR=	1.2.20rc4
 
 LIB=	png
-SRCS=	png.c pngerror.c pnggccrd.c pngget.c pngmem.c pngpread.c \
+SRCS=	png.c pngerror.c pngget.c pngmem.c pngpread.c \
 	pngread.c pngrio.c pngrtran.c pngrutil.c pngset.c pngtrans.c \
 	pngwio.c pngwrite.c pngwtran.c pngwutil.c
 
diff --git a/scripts/makefile.sco b/scripts/makefile.sco
index daa5661..3827f51 100644
--- a/scripts/makefile.sco
+++ b/scripts/makefile.sco
@@ -9,7 +9,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.sggcc b/scripts/makefile.sggcc
index 94f428c..58f7ec6 100644
--- a/scripts/makefile.sggcc
+++ b/scripts/makefile.sggcc
@@ -6,7 +6,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.sgi b/scripts/makefile.sgi
index 371fc32..64a6595 100644
--- a/scripts/makefile.sgi
+++ b/scripts/makefile.sgi
@@ -6,7 +6,7 @@
 # Library name:
 LIBNAME=libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.so9 b/scripts/makefile.so9
index 4dace29..3b1a286 100644
--- a/scripts/makefile.so9
+++ b/scripts/makefile.so9
@@ -8,7 +8,7 @@
 
 # Library name:
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 LIBNAME = libpng12
 
diff --git a/scripts/makefile.solaris b/scripts/makefile.solaris
index daf96f8..01b10f4 100644
--- a/scripts/makefile.solaris
+++ b/scripts/makefile.solaris
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
diff --git a/scripts/makefile.solaris-x86 b/scripts/makefile.solaris-x86
index 7a9fc16..e4e2b74 100644
--- a/scripts/makefile.solaris-x86
+++ b/scripts/makefile.solaris-x86
@@ -8,7 +8,7 @@
 # Library name:
 LIBNAME = libpng12
 PNGMAJ = 0
-PNGMIN = 1.2.20rc3
+PNGMIN = 1.2.20rc4
 PNGVER = $(PNGMAJ).$(PNGMIN)
 
 # Shared library names:
@@ -69,7 +69,7 @@
 
 OBJS = png.o pngset.o pngget.o pngrutil.o pngtrans.o pngwutil.o \
 	pngread.o pngrio.o pngwio.o pngwrite.o pngrtran.o \
-	pngwtran.o pngmem.o pngerror.o pngpread.o pnggccrd.o
+	pngwtran.o pngmem.o pngerror.o pngpread.o
 
 OBJSDLL = $(OBJS:.o=.pic.o)
 
@@ -232,7 +232,6 @@
 pngmem.o pngmem.pic.o: png.h pngconf.h
 pngset.o pngset.pic.o: png.h pngconf.h
 pngget.o pngget.pic.o: png.h pngconf.h
-pnggccrd.o pnggccrd.pic.o: png.h pngconf.h
 pngread.o pngread.pic.o: png.h pngconf.h
 pngrtran.o pngrtran.pic.o: png.h pngconf.h
 pngrutil.o pngrutil.pic.o: png.h pngconf.h
diff --git a/scripts/makefile.vcawin32 b/scripts/makefile.vcawin32
index 4c2f3cc..99f5430 100644
--- a/scripts/makefile.vcawin32
+++ b/scripts/makefile.vcawin32
@@ -26,8 +26,7 @@
 OBJS1 = png$(O) pngerror$(O) pngget$(O) pngmem$(O) pngpread$(O)
 OBJS2 = pngread$(O) pngrio$(O) pngrtran$(O) pngrutil$(O) pngset$(O)
 OBJS3 = pngtrans$(O) pngwio$(O) pngwrite$(O) pngwtran$(O) pngwutil$(O)
-OBJS4 = pngvcrd$(O)
-OBJS  = $(OBJS1) $(OBJS2) $(OBJS3) $(OBJS4)
+OBJS  = $(OBJS1) $(OBJS2) $(OBJS3)
 
 # Targets
 all: libpng.lib
@@ -80,9 +79,6 @@
 pngwutil$(O): png.h pngconf.h
 	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
 
-pngvcrd$(O): png.h pngconf.h
-	$(CC) -c $(CFLAGS) $*.c $(ERRFILE)
-
 libpng.lib: $(OBJS)
 	-$(RM) $@
 	$(AR) $(ARFLAGS) -out:$@ $(OBJS) $(ERRFILE)
diff --git a/scripts/pngos2.def b/scripts/pngos2.def
index d6074f3..4a6b3f1 100644
--- a/scripts/pngos2.def
+++ b/scripts/pngos2.def
@@ -2,7 +2,7 @@
 ; PNG.LIB module definition file for OS/2
 ;----------------------------------------
 
-; Version 1.2.20rc3
+; Version 1.2.20rc4
 
 LIBRARY		PNG
 DESCRIPTION	"PNG image compression library for OS/2"
diff --git a/scripts/pngw32.def b/scripts/pngw32.def
index 05a97cf..b31fa79 100644
--- a/scripts/pngw32.def
+++ b/scripts/pngw32.def
@@ -5,7 +5,7 @@
 LIBRARY
 
 EXPORTS
-;Version 1.2.20rc3
+;Version 1.2.20rc4
   png_build_grayscale_palette  @1
   png_check_sig        @2
   png_chunk_error      @3